news2kindle 0.1.1

@@ -0,0 +1,236 @@
+ # scraping internet.watch.impress.co.jp for News2Kindle
+ #
+
+ require 'nokogiri'
+ require 'open-uri'
+ require 'uri'
+ require 'ostruct'
+ require 'tmpdir'
+ require 'pathname'
+ require 'fileutils'
+
+ module News2Kindle
+ module Generator
+ class InternetWatch
+ TOP = 'https://internet.watch.impress.co.jp'
+
+ def initialize( tmpdir )
+ @current_dir = tmpdir
+
+ @src_dir = @current_dir + '/src'
+ Dir::mkdir( @src_dir )
+
+ @dst_dir = @current_dir + '/dst'
+ Dir::mkdir( @dst_dir )
+ FileUtils.cp( "./resource/internet-watch.jpg", @dst_dir )
+ FileUtils.cp( "./resource/internet-watch.css", @dst_dir )
+ end
+
+ def generate(opts)
+ now = opts[:now]
+ items = []
+
+ rdf_file = "http://rss.rssad.jp/rss/internetwatch/internet.rdf"
+ rdf = retry_loop( 5 ) do
+ Nokogiri(open(rdf_file, 'r:utf-8', &:read))
+ end
+ (rdf / 'item' ).each do |item|
+ uri = URI( item.attr( 'rdf:about' ).to_s )
+ next unless /internet\.watch\.impress\.co\.jp/ =~ uri.host
+ uri.query = nil # remove query of 'ref=rss'
+ next if News2Kindle::DupChecker.dup?(uri)
+
+ title = (item / 'title').text
+ date = item.elements.map{|e| e.text if e.name == 'date'}.join
+ items << OpenStruct::new( :uri => uri, :title => title, :date => date )
+ end
+ items.sort!{|a,b| a.date <=> b.date}
+
+ now_str = now.strftime( '%Y-%m-%d %H:%M' )
+
+ #
+ # generating articles in html
+ #
+ items.each do |item|
+ begin
+ article = get_article( item.uri )
+ open( "#{@dst_dir}/#{item_id item.uri}.html", 'w' ) do |f|
+ f.puts html_header( item.title )
+ contents = (article / 'div.mainContents')
+ (contents / 'img').each do |img|
+ org = img.attr('ajax') || img.attr('src')
+ next if org =~ /^http/ # skip images on other servers
+ begin
+ img_file = retry_loop( 5 ) do
+ open( "#{TOP}#{org}", &:read )
+ end
+ cache = "#{org.gsub( /\//, '_' ).sub( /^_/, '' )}"
+ open( "#{@dst_dir}/#{cache}", 'w' ){|f| f.write img_file}
+ img.set_attribute( 'src', cache )
+ rescue OpenURI::HTTPError
+ News2Kindle.logger.error "skipped an image: #{TOP}#{org}"
+ end
+ end
+ f.puts contents.inner_html
+ f.puts html_footer
+ end
+ rescue
+ News2Kindle.logger.warn "#{$!.class}: #$!"
+ News2Kindle.logger.warn "skipped an article: #{item.uri}"
+ end
+ end
+
+ #
+ # generating TOC in html
+ #
+ open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |f|
+ f.write html_header( 'Table of Contents' )
+ if items.size == 0
+ f.puts %Q|<p>本日は記事がありません。</p>|
+ else
+ f.puts "<ul>"
+ items.each do |item|
+ f.puts %Q|\t<li><a href="#{item_id item.uri}.html">#{item.title}</a></li>|
+ end
+ f.puts "</ul>"
+ end
+ f.write html_footer
+ end
+
+ #
+ # generating TOC in ncx
+ #
+ open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |f|
+ f.write <<-XML.gsub( /^\t/, '' )
+ <?xml version="1.0" encoding="UTF-8"?>
+ <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
+ <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
+ <docTitle><text>INTERNET Watch (#{now_str})</text></docTitle>
+ <navMap>
+ <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
+ XML
+
+ items.each_with_index do |item, index|
+ f.puts %Q|\t\t<navPoint id="#{item_id item.uri}" playOrder="#{index}"><navLabel><text>#{item.title}</text></navLabel><content src="#{item_id item.uri}.html" /></navPoint>|
+ end
+
+ f.write <<-XML.gsub( /^\t/, '' )
+ </navMap>
+ </ncx>
+ XML
+ end
+
+ #
+ # generating OPF
+ #
+ open( "#{@dst_dir}/internet-watch.opf", 'w:utf-8' ) do |f|
+ f.write <<-XML.gsub( /^\t/, '' )
+ <?xml version="1.0" encoding="utf-8"?>
+ <package unique-identifier="uid">
+ <metadata>
+ <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
+ <dc:Title>INTERNET Watch (#{now_str})</dc:Title>
+ <dc:Language>ja-JP</dc:Language>
+ <dc:Creator>インプレス</dc:Creator>
+ <dc:Description>INTERNET Watch、#{now_str}生成</dc:Description>
+ <dc:Date>#{now.strftime( '%d/%m/%Y' )}</dc:Date>
+ </dc-metadata>
+ <x-metadata>
+ <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
+ <EmbeddedCover>internet-watch.jpg</EmbeddedCover>
+ </x-metadata>
+ </metadata>
+ <manifest>
+ <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
+ <item id="style" media-type="text/css" href="internet-watch.css"></item>
+ <item id="index" media-type="text/html" href="toc.html"></item>
+ XML
+
+ items.each do |item|
+ f.puts %Q|\t\t<item id="#{item_id item.uri}" media-type="text/html" href="#{item_id item.uri}.html"></item>|
+ end
+
+ f.write <<-XML.gsub( /^\t/, '' )
+ </manifest>
+ <spine toc="toc">
+ <itemref idref="index" />
+ XML
+
+ items.each do |item|
+ f.puts %Q|\t<itemref idref="#{item_id item.uri}" />\n|
+ end
+
+ f.write <<-XML.gsub( /^\t/, '' )
+ </spine>
+ <tours></tours>
+ <guide>
+ <reference type="toc" title="Table of Contents" href="toc.html"></reference>
+ <reference type="start" title="Table of Contents" href="toc.html"></reference>
+ </guide>
+ </package>
+ XML
+ end
+
+ yield "#{@dst_dir}/internet-watch.opf"
+ end
+
+ private
+
+ def retry_loop( times )
+ count = 0
+ begin
+ yield
+ rescue
+ count += 1
+ if count >= times
+ raise
+ else
+ News2Kindle.logger.error $!
+ News2Kindle.logger.info "#{count} retry."
+ sleep 1
+ retry
+ end
+ end
+ end
+
+ def item_id( uri )
+ File::basename( uri.path, '.html' )
+ end
+
+ def get_article( uri )
+ cache = "#{@src_dir}/#{File::basename uri.path}"
+ begin
+ html = open( cache, &:read )
+ rescue Errno::ENOENT
+ #puts "getting article: #{uri.path}".encode( Encoding::default_external )
+ html = retry_loop( 5 ) do
+ open( uri, &:read )
+ end
+ open( cache, 'w' ){|f| f.write html }
+ end
+ Nokogiri( html.encode 'UTF-8', invalid: :replace, undef: :replace, replace: '?' )
+ end
+
+ def html_header( title )
+ <<-HTML.gsub( /^\t/, '' )
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+ <html>
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
+ <title>#{title}</title>
+ <link rel="stylesheet" href="internet-watch.css" type="text/css" media="all"></link>
+ </head>
+ <body>
+ <h1>#{title}</h1>
+ HTML
+ end
+
+ def html_footer
+ <<-HTML.gsub( /^\t/, '' )
+ </body>
+ </html>
+ HTML
+ end
+ end
+ end
+ end
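For orientation, a minimal driver sketch (editorial, not part of this diff): it assumes the surrounding News2Kindle runner supplies News2Kindle::DupChecker, News2Kindle.logger, the ./resource files, and network access, and that a later stage turns the yielded OPF into a MOBI file.

require 'tmpdir'

Dir.mktmpdir do |tmpdir|
  generator = News2Kindle::Generator::InternetWatch.new(tmpdir)
  # :now is the timestamp used in the NCX/OPF titles above.
  generator.generate(now: Time.now) do |opf_path|
    # opf_path is ".../dst/internet-watch.opf"; handing it to a MOBI builder
    # is an assumption about the surrounding pipeline, not shown in this diff.
    puts "OPF ready: #{opf_path}"
  end
end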
@@ -0,0 +1,18 @@
+ # scraping nikkei.com (for free user) for News2Kindle
+ #
+
+ require 'nokogiri'
+ require 'open-uri'
+ require 'tmpdir'
+ require 'pathname'
+ require (File.dirname(__FILE__) + '/nikkei-paid')
+
+ module News2Kindle
+ module Generator
+ class NikkeiFree < NikkeiPaid
+ def auth
+ return nil, nil
+ end
+ end
+ end
+ end
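Editorial note: because auth returns nil for both credentials, the inherited NikkeiPaid#generate takes its unauthenticated branch (agent.get( TOP )) and skips the login and logout requests. The inherited basename helper derives the output name from this subclass, so this generator yields dst/nikkei-free.opf rather than dst/nikkei-paid.opf; a sketch of that derivation:

'News2Kindle::Generator::NikkeiFree'
  .sub(/.*:/, '')          # "NikkeiFree"
  .gsub(/([A-Z])/, '-\1')  # "-Nikkei-Free"
  .sub(/^-/, '')           # "Nikkei-Free"
  .downcase                # => "nikkei-free"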
@@ -0,0 +1,352 @@
+ # scraping nikkei.com (for paid user) for News2Kindle
+ #
+
+ require 'mechanize'
+ require 'nokogiri'
+ require 'open-uri'
+ require 'tmpdir'
+ require 'pathname'
+ require 'fileutils' # needed for the FileUtils.cp calls below
+
+ module News2Kindle
+ module Generator
+ class NikkeiPaid
+ class IllegalPage < StandardError; end
+
+ TOP = 'https://www.nikkei.com'
+ LOGIN = "#{TOP}/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F"
+
+ def initialize( tmpdir )
+ @nikkei_id, @nikkei_pw = auth
+ @current_dir = tmpdir
+
+ @src_dir = @current_dir + '/src'
+ Dir::mkdir( @src_dir )
+
+ @dst_dir = @current_dir + '/dst'
+ Dir::mkdir( @dst_dir )
+ FileUtils.cp( "./resource/nikkei.jpg", @dst_dir )
+ FileUtils.cp( "./resource/nikkei.css", @dst_dir )
+ end
+
+ def generate(opts)
+ @now = opts[:now]
+ @now_str = @now.strftime '%Y-%m-%d %H:%M'
+
+ agent = Mechanize::new
+ agent.set_proxy( *ENV['HTTP_PROXY'].split( /:/ ) ) if ENV['HTTP_PROXY']
+
+ toc = []
+ if @nikkei_id and @nikkei_pw
+ agent.get('https://regist.nikkei.com/ds/etc/accounts/logout')
+ agent.get( LOGIN )
+ agent.page.form_with( :name => 'autoPostForm' ).submit
+ agent.page.form_with( :name => 'LA7010Form01' ) do |form|
+ form['LA7010Form01:LA7010Email'] = @nikkei_id
+ form['LA7010Form01:LA7010Password'] = @nikkei_pw
+ form.click_button
+ end
+ agent.page.forms.first.submit
+ else
+ agent.get( TOP )
+ end
+
+ #
+ # scraping top news
+ #
+ toc_top = ['TOP NEWS']
+ %w(first second third fourth).each do |category|
+ (agent.page / "div.nx-top_news_#{category} h3 a").each do |a|
+ uri = a.attr('href')
+ next if News2Kindle::DupChecker.dup?(uri)
+ toc_top << [canonical( a.text.strip ), uri]
+ end
+ end
+ toc << toc_top
+
+ #
+ # scraping all categories
+ #
+ (agent.page / 'div.cmnc-genre').each do |genre|
+ toc_cat = []
+ (genre / 'h4.cmnc-genre_title a.cmnc-title_text').each do |cat|
+ next if /local/ =~ cat.attr( 'href' )
+ toc_cat << cat.text
+ (genre / 'li a').each do |article|
+ uri = article.attr('href')
+ next if News2Kindle::DupChecker.dup?(uri)
+ toc_cat << [canonical( article.text ), uri]
+ end
+ end
+ toc << toc_cat
+ end
+
+ begin
+ generate_contents( toc, agent )
+ yield "#{@dst_dir}/#{basename}.opf"
+ end
+
+ if @nikkei_id and @nikkei_pw
+ agent.get('https://regist.nikkei.com/ds/etc/accounts/logout')
+ end
+ end
+
+ private
+
+ def auth
+ require 'pit'
+ login = Pit::get('news2kindle', require: {
+ nikkei_user: 'your ID of Nikkei.',
+ nikkei_pass: 'your Password of Nikkei.',
+ })
+ return login[:nikkei_user], login[:nikkei_pass]
+ end
+
+ def basename
+ self.class.to_s.sub(/.*:/, '').gsub(/([A-Z])/, '-\\1').sub(/^-/, '').downcase
+ end
+
+ def canonical( str )
+ str.gsub( /\uFF5E/, "\u301C" ) # for WAVE DASH problem
+ end
+
+ def retry_loop( times )
+ count = 0
+ begin
+ yield
+ rescue
+ count += 1
+ if count >= times
+ raise
+ else
+ News2Kindle.logger.error $!
+ News2Kindle.logger.info "#{count} retry."
+ retry
+ end
+ end
+ end
+
+ def html_header( title )
+ <<~HTML
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+ <html>
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
+ <title>#{title}</title>
+ <link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>
+ </head>
+ <body>
+ <h1>#{title}</h1>
+ HTML
+ end
+
+ def get_html_item( agent, uri, sub = nil )
+ uri.sub!( %r|^https://www.nikkei.com|, '' )
+ aid = uri2aid( uri )
+ html = nil
+ if File::exist?( "#{@src_dir}/#{aid}#{sub}.html" ) # loading cache
+ html = Nokogiri( open( "#{@src_dir}/#{aid}#{sub}.html", 'r:utf-8', &:read ) )
+ else
+ begin
+ #puts "getting html #{aid}#{sub}"
+ retry_loop( 5 ) do
+ agent.get( "#{TOP}#{uri}" )
+ html = agent.page.root
+ sleep 1
+ end
+ rescue
+ News2Kindle.logger.error "cannot get #{TOP}#{uri}."
+ raise
+ end
+ open( "#{@src_dir}/#{aid}#{sub}.html", 'w:utf-8' ) do |f|
+ f.write( html.to_html )
+ end
+ end
+ html
+ end
+
+ def scrape_html_item( html )
+ result = ''
+ (html / 'div.cmn-article_text').each do |div|
+ div.children.each do |e|
+ #div.css('div.cmn-photo_style2 img', 'p', 'table').each do |e|
+ case e.name
+ when 'p'
+ next unless (e / 'a.cmnc-continue').empty?
+ (e / 'span.JSID_urlData').remove
+ para = canonical e.text.strip.sub( /^ /, '' )
+ result << "\t<p>#{para}</p>" unless para.empty?
+ when 'table'
+ result << e.to_html
+ when 'div'
+ e.css('img').each do |img|
+ image_url = img['src']
+ next if /^http/ =~ image_url # skip images on other servers
+ next if /^\/\// =~ image_url # skip assets
+ image_file = File::basename( image_url )
+ begin
+ image = open( "#{TOP}#{image_url.sub /PN/, 'PB'}", &:read )
+ open( "#{@dst_dir}/#{image_file}", 'w' ){|fp| fp.write image}
+ result << %Q|\t<div>|
+ result << %Q|\t\t<img src="#{image_file}">|
+ result << %Q|\t\t<p>[#{e.text}]</p>| unless e.text.strip.empty?
+ result << %Q|\t</div>|
+ rescue
+ News2Kindle.logger.debug $!
+ News2Kindle.logger.warn "FAIL TO DOWNLOAD IMAGE: #{image_url}"
+ end
+ end
+ end
+ end
+ end
+ result
+ end
+
+ def html_item( item, uri, agent )
+ aid = uri2aid( uri )
+ return '' unless aid
+ html = get_html_item( agent, uri )
+ out_file = "#{@dst_dir}/#{aid}.html"
+
+ begin
+ open( out_file, 'w:utf-8' ) do |f|
+ f.puts canonical( html_header( (html / 'h1.cmn-article_title, h4.cmn-article_title, h2.cmn-article_title')[0].text.strip ) )
+ f.puts scrape_html_item( html )
+ (html / 'div.cmn-article_nation ul li a').map {|link|
+ link.attr( 'href' )
+ }.sort.uniq.each_with_index do |link,index|
+ f.puts scrape_html_item( get_html_item( agent, link, index + 2 ) )
+ end
+ f.puts html_footer
+ end
+
+ %Q|\t\t<li><a href="#{aid}.html">#{item}</a></li>|
+ rescue NoMethodError
+ News2Kindle.logger.debug $!
+ News2Kindle.logger.error "page parsing failed. #{aid}"
+ File.delete out_file
+ raise IllegalPage.new
+ end
+ end
+
+ def html_footer
+ <<~HTML
+ </body>
+ </html>
+ HTML
+ end
+
+ def ncx_header
+ <<~XML
+ <?xml version="1.0" encoding="UTF-8"?>
+ <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
+ <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
+ <docTitle><text>日経電子版 (#{@now_str})</text></docTitle>
+ <navMap>
+ <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
+ XML
+ end
+
+ def ncx_item( item, uri, index )
+ aid = uri2aid( uri )
+ aid ? %Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{item}</text></navLabel><content src="#{aid}.html" /></navPoint>| : ''
+ end
+
+ def ncx_footer
+ <<~XML
+ </navMap>
+ </ncx>
+ XML
+ end
+
+ def opf_header
+ <<~XML
+ <?xml version="1.0" encoding="utf-8"?>
+ <package unique-identifier="uid">
+ <metadata>
+ <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
+ <dc:Title>日経電子版 (#{@now_str})</dc:Title>
+ <dc:Language>ja-JP</dc:Language>
+ <dc:Creator>日本経済新聞社</dc:Creator>
+ <dc:Description>日経電子版、#{@now_str}生成</dc:Description>
+ <dc:Date>#{@now.strftime( '%d/%m/%Y' )}</dc:Date>
+ </dc-metadata>
+ <x-metadata>
+ <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
+ <EmbeddedCover>nikkei.jpg</EmbeddedCover>
+ </x-metadata>
+ </metadata>
+ <manifest>
+ <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
+ <item id="style" media-type="text/css" href="nikkei.css"></item>
+ <item id="index" media-type="text/html" href="toc.html"></item>
+ XML
+ end
+
+ def opf_item( uri )
+ aid = uri2aid( uri )
+ aid ? %Q|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>| : ''
+ end
+
+ def opf_footer( aids )
+ items = aids.map{|aid| %Q|\t<itemref idref="#{aid}" />|}
+ <<~XML
+ </manifest>
+ <spine toc="toc">
+ #{items.join("\n")}
+ <itemref idref="index" />
+ </spine>
+ <tours></tours>
+ <guide>
+ <reference type="toc" title="Table of Contents" href="toc.html"></reference>
+ <reference type="start" title="Top Story" href="#{aids[0]}.html"></reference>
+ </guide>
+ </package>
+ XML
+ end
+
+ def uri2aid( uri )
+ uri.scan( %r|/article/([^/]*)/| ).flatten[0]
+ end
+
+ def generate_contents( toc, agent )
+ open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |html|
+ open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |ncx|
+ open( "#{@dst_dir}/#{basename}.opf", 'w:utf-8' ) do |opf|
+ first = true
+ toc_index = 0
+ aids = []
+ ncx.puts ncx_header
+ opf.puts opf_header
+ toc.each do |category|
+ category.each do |article|
+ if article.class == String
+ html.puts first ?
+ html_header( 'Table of Contents' ) :
+ "\t</ul>\n\t<mbp:pagebreak />"
+ html.puts "\t<h2>#{article}</h2>"
+ html.puts "\t<ul>"
+ first = false
+ else
+ begin
+ html.puts html_item( article[0], article[1], agent )
+ ncx.puts ncx_item( article[0], article[1], toc_index += 1 )
+ unless aids.index( uri2aid( article[1] ) )
+ opf.puts opf_item( article[1] )
+ aids << uri2aid( article[1] ) if uri2aid( article[1] )
+ end
+ rescue IllegalPage
+ end
+ end
+ end
+ end
+ html.puts "\t</ul>"
+ html.puts html_footer
+ ncx.puts ncx_footer
+ opf.puts opf_footer( aids )
+ end
+ end
+ end
+ end
+ end
+ end
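Editorial illustration with a made-up article path: uri2aid extracts the article id that names the cached source HTML, the per-article output file, and the manifest/spine entries. (Credentials themselves come from the pit gem; as far as pit's usual behavior goes, Pit::get prompts for nikkei_user and nikkei_pass when they are not yet stored.)

uri = '/article/DGXMZOexample0001/?n_cid=kobetsu'   # hypothetical path
uri.scan( %r|/article/([^/]*)/| ).flatten[0]        # => "DGXMZOexample0001"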