RubyGems - aozoragen - Versions diffs - 0.0.6 → 0.1.0 - Mend

aozoragen 0.0.6 → 0.1.0

Files changed (7) hide show

data/bin/aozoragen +52 -18
data/lib/aozoragen/renzaburo.rb +41 -41
data/lib/aozoragen/sai-zen-sen.rb +95 -70
data/lib/aozoragen/util.rb +69 -0
data/lib/aozoragen/version.rb +1 -1
data/lib/aozoragen/webmysteries.rb +52 -48
metadata +5 -4

data/bin/aozoragen CHANGED Viewed

@@ -6,37 +6,71 @@
 # Copyright (C) 2012 by TADA Tadashi <t@tdtds.jp>
 # Distributed under GPL
 #
+require 'optparse'
 require 'uri'
 require 'pathname'
+opts = {
+	overwrite: false,
+	output: '.',
+	verbose: false,
+}
+OptionParser.new do |o|
+	o.banner = 'Usage: aozoragen [options] <URI...>'
+	o.on( '-f', '--overwrite', 'force overwrite existent files.' ){|b| opts[:overwrite] = b}
+	o.on( '-O DIR', '--output DIR', 'specify output directory.' ){|dir| opts[:output] = dir}
+	o.on( '-v', '--verbose', 'show progress messages.' ){|b| opts[:verbose] = b}
+	begin
+		o.parse!
+	rescue OptionParser::InvalidOption
+		puts "invalid option\n\n#{o}"
+		exit -1
+	end
+end
+def write_open( file, opts )
+	name = "#{opts[:output]}/#{file}"
+	if !opts[:overwrite] && FileTest::exist?( name )
+		puts "Skipping write #{file}." if opts[:verbose]
+		return
+	end
+	open( name, 'w' ){|w| yield w}
+	puts name if opts[:verbose]
+end
 ARGV.each do |u|
 	uri = URI( u )
-	book = nil
-	case uri.host
-	when 'sai-zen-sen.jp'
-		require 'aozoragen/sai-zen-sen'
-		book = SaiZenSen::new( uri )
-	when 'renzaburo.jp'
-		require 'aozoragen/renzaburo'
-		book = Renzaburo::new( uri )
-	when 'github.com'
-		case Pathname( uri.path ).basename.to_s.sub( %r|(.*?)-.*$|, '\1' )
-		when 'webmysteries'
-			require 'aozoragen/webmysteries'
-			book = Webmysteries::new( uri )
+	service = case uri.host
+		when 'sai-zen-sen.jp'
+			'sai-zen-sen'
+		when 'renzaburo.jp'
+			'renzaburo'
+		when 'github.com'
+			case Pathname( uri.path ).basename.to_s.sub( %r|(.*?)-.*$|, '\1' )
+			when 'webmysteries'
+				'webmisteries'
+			else
+				nil
+			end
+		else
+			nil
 		end
-	else
-		p "Error: unknown URI: #{uri}"
+	unless service
+		puts "Skipping unknown URI: #{uri}"
+		next
 	end
-	next unless book
+	require "aozoragen/#{service}"
+	book = (Aozoragen.const_get service.split(/-/).map{|s| s.capitalize}.join)::new( uri )
 	meta = book.metainfo
-	open( "#{meta[:id]}.00.txt", 'w' ) do |w|
+	write_open( "#{meta[:id]}.00.txt", opts ) do |w|
 		w.puts "#{meta[:title]}\n#{meta[:author].join ' / '}\n\n\n［＃改ページ］"
 	end
 	book.each_chapter do |chapter|
-		open( "#{meta[:id]}.#{chapter[:id]}.txt", 'w' ) do |w|
+		write_open( "#{meta[:id]}.#{chapter[:id]}.txt", opts ) do |w|
 			w.puts chapter[:text]
 		end
 	end

data/lib/aozoragen/renzaburo.rb CHANGED Viewed

@@ -2,55 +2,55 @@
 #
 # scraping renzaburo.jp
 #
-require 'nokogiri'
+require 'aozoragen/util'
 require 'open-uri'
 require 'pathname'
-class Renzaburo
-	def initialize( index_uri )
-		@index_uri = URI( index_uri.to_s.sub( /index\.html$/, '' ) )
-		@index_html = Nokogiri( open( @index_uri, 'r:CP932', &:read ) )
-	end
+module Aozoragen
+	class Renzaburo
+		include Util
-	def metainfo
-		info = {:id => Pathname( @index_uri.path ).basename.to_s}
-		(@index_html / 'title').each do |t|
-			info[:title] = t.text.sub( /｜.*/, '' )
-		end
-		(@index_html / 'div.textBlock strong' ).each do |st|
-			info[:author] = [st.text]
+		def initialize( index_uri )
+			@index_uri = URI( index_uri.to_s.sub( /index\.html$/, '' ) )
+			@index_html = Nokogiri( open( @index_uri, 'r:CP932', &:read ) )
 		end
-		info
-	end
-	def each_chapter
-		book_title = metainfo[:title]
-		(@index_html / 'ul.btnList li.withDate a' ).each do |a|
-			uri = @index_uri + a.attr( :href )
-			text = get_content( uri, book_title )
-			yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
+		def metainfo
+			info = {:id => Pathname( @index_uri.path ).basename.to_s}
+			(@index_html / 'title').each do |t|
+				info[:title] = t.text.sub( /｜.*/, '' )
+			end
+			(@index_html / 'div.textBlock strong' ).each do |st|
+				info[:author] = [st.text]
+			end
+			info
 		end
-	end
-	def get_content( uri, book_title = '' )
-		text = ''
-		html = open( uri, 'r:CP932', &:read ).encode( 'UTF-8' )
-		html = html.gsub( /\&mdash;/, "\u2500" ).gsub( /\&quot;/, "\u201D" )
-		(Nokogiri( html ) / 'div#mainContent' ).each do |content|
-			(content / 'h3').each do |t|
-				title = t.text.sub( /^『#{book_title}』　/, '' )
-				text << "\n［＃小見出し］#{title}［＃小見出し終わり］\n\n"
+		def each_chapter
+			book_title = metainfo[:title]
+			(@index_html / 'ul.btnList li.withDate a' ).each do |a|
+				uri = @index_uri + a.attr( :href )
+				text = get_content( uri, book_title )
+				yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
 			end
-			(content / 'div.textBlock p' ).each do |para|
-				next if /＜次回につづく＞/ =~ para.text
-				text << '［＃１０字下げ］' if (para.attr('class') || '').index( 'txtAlignC' )
-				text << para.text.gsub( /<br>/, "\n" ) << "\n\n"
+		end
+		def get_content( uri, book_title = '' )
+			text = ''
+			html = open( uri, 'r:CP932', &:read ).encode( 'UTF-8' )
+			html = html.gsub( /\&mdash;/, "\u2500" ).gsub( /\&quot;/, "\u201D" )
+			(Nokogiri( html ) / 'div#mainContent' ).each do |content|
+				(content / 'h3').each do |t|
+					text << t.text.sub( /^『#{book_title}』　/, '' ).subhead
+				end
+				(content / 'div.textBlock p' ).each do |para|
+					next if /＜次回につづく＞/ =~ para.text
+					text << '［＃１０字下げ］' if (para.attr('class') || '').index( 'txtAlignC' )
+					text << para.text.gsub( /<br>/, "\n" ) << "\n\n"
+				end
 			end
+			text << "［＃改ページ］\n"
+			text.for_tategaki
 		end
-		text << "［＃改ページ］\n"
-		text.gsub( /＜/, '〈' ).gsub( /＞/, '〉' ).tr(
-			'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
-			'ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ０１２３４５６７８９'
-		)
 	end
 end

data/lib/aozoragen/sai-zen-sen.rb CHANGED Viewed

@@ -2,37 +2,54 @@
 #
 # scraping sai-zen-sen.jp
 #
-require 'nokogiri'
+require 'aozoragen/util'
 require 'open-uri'
 require 'pathname'
-class SaiZenSen
-	def initialize( index_uri )
-		if index_uri.path == '/sa/fate-zero/works/'
-			@entity = SaiZenSenFateZero::new( index_uri )
-		else
-			@entity = SaiZenSenRegular::new( index_uri )
-		end
-	end
+module Aozoragen
+	class SaiZenSen
+		include Util
-	def metainfo
-		@entity.metainfo
-	end
-	def each_chapter
-		@entity.each_chapter{|c| yield c}
-	end
+		def initialize( index_uri )
+			case index_uri.path
+			when '/sa/fate-zero/works/'
+				@entity = SaiZenSenFateZero::new( index_uri )
+			when %r|/01.html$| # short story
+				@entity = SaiZenSenShort::new( index_uri )
+			else
+				@entity = SaiZenSenRegular::new( index_uri )
+			end
+		end
+		def metainfo
+			@entity.metainfo
+		end
+		def each_chapter
+			@entity.each_chapter do |c|
+				c[:text] = c[:text].normalize_char
+				yield c
+			end
+		end
+		def each_chapter_local( selector )
+			(@index_html / selector).each do |a|
+				uri = @index_uri + a.attr('href')
+				next if uri.path == '/entryguide.html' # skipping member only contents.
-	def each_chapter_local( selector )
-		(@index_html / selector).each do |a|
-			uri = @index_uri + a.attr('href')
-			chapter = Nokogiri( open( uri, 'r:utf-8', &:read ) )
+				chapter = Nokogiri( open( uri, 'r:utf-8', &:read ) )
+				text = get_chapter_text( chapter )
+				yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
+			end
+		end
+		def get_chapter_text( chapter )
 			text = ''
 			(chapter / 'section.book-page-spread').each do |page|
 				page.children.each do |section|
 					case section.name
 					when 'hgroup'
-						text << "\n［＃小見出し］#{detag section}［＃小見出し終わり］\n\n"
+						text << detag( section ).subhead
 					when 'div'
 						case section.attr( 'class' )
 						when /delimiter/
@@ -54,61 +71,69 @@ class SaiZenSen
 				end
 				text << "［＃改ページ］\n"
 			end
-			yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
+			text
 		end
 	end
-	def detag( elem )
-		(elem / 'ruby rp').each do |rp|
-			case rp.text
-			when '（'
-				rp.inner_html = '《'
-			when '）'
-				rp.inner_html = '》'
+	class SaiZenSenRegular < SaiZenSen
+		def initialize( index_uri )
+			@index_uri = index_uri
+			@index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
+		end
+		def metainfo
+			info = {:id => Pathname( @index_uri.path ).basename.to_s, :author => []}
+			info[:title] = (@index_html / '#page-content-heading h1')[0].text
+			(@index_html / '#authors h3').each do |author|
+				info[:author] << author.text.sub( /.*? /, '' )
 			end
+			info
 		end
-		elem.to_html.gsub( /<.*?>/, '' ).strip
-	end
-end
-class SaiZenSenRegular < SaiZenSen
-	def initialize( index_uri )
-		@index_uri = index_uri
-		@index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
-	end
-	def metainfo
-		info = {:id => Pathname( @index_uri.path ).basename.to_s, :author => []}
-		info[:title] = (@index_html / '#page-content-heading h1')[0].text
-		(@index_html / '#authors h3').each do |author|
-			info[:author] << author.text.sub( /.*? /, '' )
+		def each_chapter
+			each_chapter_local( '#back-numbers li a' ){|c| yield c}
 		end
-		info
 	end
-	def each_chapter
-		each_chapter_local( '#back-numbers li a' ){|c| yield c}
-	end
-end
-class SaiZenSenFateZero < SaiZenSen
-	def initialize( index_uri )
-		@index_uri = index_uri
-		@index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
-	end
-	def metainfo
-		info = {
-			:id => Pathname( @index_uri.path ).basename.to_s,
-			:author => ['虚淵玄']
-		}
-		info[:title] = (@index_html / 'h1 img')[0].attr( 'alt' )
-		info
+	class SaiZenSenShort < SaiZenSen
+		def initialize( index_uri )
+			@index_uri = index_uri
+			@index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
+		end
+		def metainfo
+			info = {:id => Pathname( @index_uri.path ).dirname.dirname.basename.to_s, :author => []}
+			info[:title] = (@index_html / 'h1.book-title')[0].text
+			(@index_html / 'h2.book-author strong').each do |author|
+				info[:author] << author.text
+			end
+			info
+		end
+		def each_chapter
+			text = get_chapter_text( @index_html )
+			yield( {id: Pathname( @index_uri.path ).basename( '.html' ).to_s, uri: @index_uri, text: text} )
+		end
 	end
-	def each_chapter
-		each_chapter_local( 'article a' ){|c| yield c}
+	class SaiZenSenFateZero < SaiZenSen
+		def initialize( index_uri )
+			@index_uri = index_uri
+			@index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
+		end
+		def metainfo
+			info = {
+				:id => Pathname( @index_uri.path ).basename.to_s,
+				:author => ['虚淵玄']
+			}
+			info[:title] = (@index_html / 'h1 img')[0].attr( 'alt' )
+			info
+		end
+		def each_chapter
+			each_chapter_local( 'article a' ){|c| yield c}
+		end
 	end
 end

data/lib/aozoragen/util.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# -*- coding: utf-8; -*-
+#
+# utility methods for converting to AOZORA format
+#
+require 'nokogiri'
+##
+# Enhanced String methods for converting to AOZORA format
+#
+class String
+	##
+	# Half width of Alphabet and Digit to Full width.
+	#
+	def han2zen
+		self.tr( 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
+			'ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ０１２３４５６７８９' )
+	end
+	##
+	# replace characters fitting to vertical layout
+	#
+	def for_tategaki
+		self.tr( '＜＞‐', '〈〉─' ).han2zen
+	end
+	##
+	# section heading format of Aozora
+	#
+	def subhead
+		self.split( /\n/ ).map{|x|
+			"\n［＃小見出し］#{x}［＃小見出し終わり］"
+		}.join + "\n\n"
+	end
+	##
+	# normalize invalid characters
+	#
+	def normalize_char
+		self.tr( "\u6451\u5653\u7e6b\uFF0D/", '掴嘘繋─' )
+	end
+end
+module Aozoragen
+	##
+	# Utility methods for Aozora format
+	#
+	module Util
+		##
+		# delete HTML tags
+		#
+		def detag( elem )
+			# ruby tags
+			(elem / 'ruby rp').each do |rp|
+				case rp.text
+				when '（'
+					rp.inner_html = '《'
+				when '）'
+					rp.inner_html = '》'
+				end
+			end
+			# delete tags
+			elem.to_html.
+				gsub( /<br>/, "\n" ).
+				gsub( /<.*?>/, '' ).
+				strip
+		end
+	end
+end

data/lib/aozoragen/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Aozoragen
-  VERSION = "0.0.6"
+  VERSION = "0.1.0"
 end

data/lib/aozoragen/webmysteries.rb CHANGED Viewed

@@ -2,67 +2,71 @@
 #
 # scraping webmysteries.jp
 #
-require 'nokogiri'
+require 'aozoragen/util'
 require 'open-uri'
 require 'pathname'
 require 'cgi'
-class Webmysteries
-	def initialize( index_uri )
-		@index_uri = URI( index_uri )
-		@index_html = Nokogiri( open( @index_uri, 'r:UTF-8', &:read ) )
-	end
+module Aozoragen
+	class Webmysteries
+		include Util
-	def metainfo
-		info = {:id => Pathname( @index_uri.path ).basename.sub( %r|.*?-(.*)$|, '\1' ).to_s}
-		info[:title] = (@index_html / '#wiki-body h1')[0].text
-		(@index_html / '#wiki-body h2 + ul li' ).each do |li|
-			info[:author] = [li.text]
+		def initialize( index_uri )
+			@index_uri = URI( index_uri )
+			@index_html = Nokogiri( open( @index_uri, 'r:UTF-8', &:read ) )
 		end
-		info
-	end
-	def each_chapter
-		(@index_html / '#wiki-body h3 + ul li a' ).each_with_index do |a, i|
-			uri = URI( a.attr( 'href' ) )
-			text = "\n"
-			each_pages( uri ) do |page|
-				text << page
+		def metainfo
+			info = {:id => Pathname( @index_uri.path ).basename.sub( %r|.*?-(.*)$|, '\1' ).to_s}
+			info[:title] = (@index_html / '#wiki-body h1')[0].text
+			(@index_html / '#wiki-body h2 + ul li' ).each do |li|
+				info[:author] = [li.text]
 			end
-			yield( {id: '%02d' % (i+1), uri: uri, text: text} )
+			info
 		end
-	end
-	def each_pages( index )
-		begin
-			pages = []
-			html = Nokogiri( open( index, 'r', &:read ) )
-			(html / 'ul.pageNavi a').each do |a|
-				pages << a.attr( 'href' )
+		def each_chapter
+			(@index_html / '#wiki-body h3 + ul li a' ).each_with_index do |a, i|
+				uri = URI( a.attr( 'href' ) )
+				text = "\n"
+				each_pages( uri ) do |page|
+					text << page
+				end
+				yield( {id: '%02d' % (i+1), uri: uri, text: text} )
 			end
-			pages.shift	# delete current page
+		end
+		def each_pages( index )
 			begin
-				(html / 'noscript param[name="FlashVars"]')[0].attr( 'value' ).scan( /entry=(\d+)/ ) do |i|
-					yield get_text( i[0] )
+				pages = []
+				html = Nokogiri( open( index, 'r', &:read ) )
+				(html / 'ul.pageNavi a').each do |a|
+					pages << a.attr( 'href' )
 				end
-			end while html = Nokogiri( open( pages.shift, 'r', &:read ) )
-		rescue TypeError
-			# ignore open nil
+				pages.shift	# delete current page
+				begin
+					(html / 'noscript param[name="FlashVars"]')[0].attr( 'value' ).scan( /entry=(\d+)/ ) do |i|
+						yield get_text( i[0] )
+					end
+				end while html = Nokogiri( open( pages.shift, 'r', &:read ) )
+			rescue TypeError
+				# ignore open nil
+			end
 		end
-	end
-	def get_text( xml_id )
-		result = ''
-		open( "http://www.webmysteries.jp/entry_xml_data/#{xml_id}.xml" ) do |fx|
-			CGI::unescape( fx.read ).scan( %r|<entryBody>(.*?)</entryBody>|m ) do |entry|
-				result << entry[0].gsub( %r|<.*?>|m, "" )
+		def get_text( xml_id )
+			result = ''
+			open( "http://www.webmysteries.jp/entry_xml_data/#{xml_id}.xml" ) do |fx|
+				CGI::unescape( fx.read ).scan( %r|<entryBody>(.*?)</entryBody>|m ) do |entry|
+					result << entry[0].gsub( %r|<.*?>|m, "" )
+				end
 			end
+			result.
+				gsub( /^.*（つづく）.*$/, '［＃改ページ］' ).
+				gsub( /(?<=.)（([あ-ん]+)）/, '《\1》' ).
+				gsub( /\n{3,}/m, "\n\n" ).
+				for_tategaki.
+				normalize_char
 		end
-		result.
-			gsub( /^.*（つづく）.*$/, '［＃改ページ］' ).
-			gsub( /(?<=.)（([あ-ん]+)）/, '《\1》' ).
-			gsub( /‐/, '─' ).
-			gsub( /\uFF0D/, '─' ).
-			gsub( /\n{3,}/m, "\n\n" )
 	end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: aozoragen
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.1.0
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-02-27 00:00:00.000000000 Z
+date: 2012-02-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &81978730 !ruby/object:Gem::Requirement
+  requirement: &72469120 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *81978730
+  version_requirements: *72469120
 description: Scraping some Ebook web site and generating AOZORA format text files.
 email:
 - t@tdtds.jp
@@ -41,6 +41,7 @@ files:
 - lib/aozoragen.rb
 - lib/aozoragen/renzaburo.rb
 - lib/aozoragen/sai-zen-sen.rb
+- lib/aozoragen/util.rb
 - lib/aozoragen/version.rb
 - lib/aozoragen/webmysteries.rb
 homepage: https://github.com/tdtds/aozoragen