RubyGems - aozoragen - Versions diffs - 0.0.6 → 0.1.0 - Mend

aozoragen 0.0.6 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/bin/aozoragen +52 -18
data/lib/aozoragen/renzaburo.rb +41 -41
data/lib/aozoragen/sai-zen-sen.rb +95 -70
data/lib/aozoragen/util.rb +69 -0
data/lib/aozoragen/version.rb +1 -1
data/lib/aozoragen/webmysteries.rb +52 -48
metadata +5 -4

data/bin/aozoragen CHANGED Viewed

@@ -6,37 +6,71 @@
 # Copyright (C) 2012 by TADA Tadashi <t@tdtds.jp>
 # Distributed under GPL
 #
+require 'optparse'
 require 'uri'
 require 'pathname'
+opts = {
+	overwrite: false,
+	output: '.',
+	verbose: false,
+}
+OptionParser.new do |o|
+	o.banner = 'Usage: aozoragen [options] <URI...>'
+	o.on( '-f', '--overwrite', 'force overwrite existent files.' ){|b| opts[:overwrite] = b}
+	o.on( '-O DIR', '--output DIR', 'specify output directory.' ){|dir| opts[:output] = dir}
+	o.on( '-v', '--verbose', 'show progress messages.' ){|b| opts[:verbose] = b}
+	begin
+		o.parse!
+	rescue OptionParser::InvalidOption
+		puts "invalid option\n\n#{o}"
+		exit -1
+	end
+end
+def write_open( file, opts )
+	name = "#{opts[:output]}/#{file}"
+	if !opts[:overwrite] && FileTest::exist?( name )
+		puts "Skipping write #{file}." if opts[:verbose]
+		return
+	end
+	open( name, 'w' ){|w| yield w}
+	puts name if opts[:verbose]
+end
 ARGV.each do |u|
 	uri = URI( u )
-	book = nil
-	case uri.host
-	when 'sai-zen-sen.jp'
-		require 'aozoragen/sai-zen-sen'
-		book = SaiZenSen::new( uri )
-	when 'renzaburo.jp'
-		require 'aozoragen/renzaburo'
-		book = Renzaburo::new( uri )
-	when 'github.com'
-		case Pathname( uri.path ).basename.to_s.sub( %r|(.*?)-.*$|, '\1' )
-		when 'webmysteries'
-			require 'aozoragen/webmysteries'
-			book = Webmysteries::new( uri )
+	service = case uri.host
+		when 'sai-zen-sen.jp'
+			'sai-zen-sen'
+		when 'renzaburo.jp'
+			'renzaburo'
+		when 'github.com'
+			case Pathname( uri.path ).basename.to_s.sub( %r|(.*?)-.*$|, '\1' )
+			when 'webmysteries'
+				'webmisteries'
+			else
+				nil
+			end
+		else
+			nil
 		end
-	else
-		p "Error: unknown URI: #{uri}"
+	unless service
+		puts "Skipping unknown URI: #{uri}"
+		next
 	end
-	next unless book
+	require "aozoragen/#{service}"
+	book = (Aozoragen.const_get service.split(/-/).map{|s| s.capitalize}.join)::new( uri )
 	meta = book.metainfo
-	open( "#{meta[:id]}.00.txt", 'w' ) do |w|
+	write_open( "#{meta[:id]}.00.txt", opts ) do |w|
 		w.puts "#{meta[:title]}\n#{meta[:author].join ' / '}\n\n\n［＃改ページ］"
 	end
 	book.each_chapter do |chapter|
-		open( "#{meta[:id]}.#{chapter[:id]}.txt", 'w' ) do |w|
+		write_open( "#{meta[:id]}.#{chapter[:id]}.txt", opts ) do |w|
 			w.puts chapter[:text]
 		end
 	end

data/lib/aozoragen/renzaburo.rb CHANGED Viewed

@@ -2,55 +2,55 @@
 #
 # scraping renzaburo.jp
 #
-require 'nokogiri'
+require 'aozoragen/util'
 require 'open-uri'
 require 'pathname'
-class Renzaburo
-	def initialize( index_uri )
-		@index_uri = URI( index_uri.to_s.sub( /index\.html$/, '' ) )
-		@index_html = Nokogiri( open( @index_uri, 'r:CP932', &:read ) )
-	end
+module Aozoragen
+	class Renzaburo
+		include Util
-	def metainfo
-		info = {:id => Pathname( @index_uri.path ).basename.to_s}
-		(@index_html / 'title').each do |t|
-			info[:title] = t.text.sub( /｜.*/, '' )
-		end
-		(@index_html / 'div.textBlock strong' ).each do |st|
-			info[:author] = [st.text]
+		def initialize( index_uri )
+			@index_uri = URI( index_uri.to_s.sub( /index\.html$/, '' ) )
+			@index_html = Nokogiri( open( @index_uri, 'r:CP932', &:read ) )
 		end
-		info
-	end
-	def each_chapter
-		book_title = metainfo[:title]
-		(@index_html / 'ul.btnList li.withDate a' ).each do |a|
-			uri = @index_uri + a.attr( :href )
-			text = get_content( uri, book_title )
-			yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
+		def metainfo
+			info = {:id => Pathname( @index_uri.path ).basename.to_s}
+			(@index_html / 'title').each do |t|
+				info[:title] = t.text.sub( /｜.*/, '' )
+			end
+			(@index_html / 'div.textBlock strong' ).each do |st|
+				info[:author] = [st.text]
+			end
+			info
 		end
-	end
-	def get_content( uri, book_title = '' )
-		text = ''
-		html = open( uri, 'r:CP932', &:read ).encode( 'UTF-8' )
-		html = html.gsub( /\&mdash;/, "\u2500" ).gsub( /\&quot;/, "\u201D" )
-		(Nokogiri( html ) / 'div#mainContent' ).each do |content|
-			(content / 'h3').each do |t|
-				title = t.text.sub( /^『#{book_title}』　/, '' )
-				text << "\n［＃小見出し］#{title}［＃小見出し終わり］\n\n"
+		def each_chapter
+			book_title = metainfo[:title]
+			(@index_html / 'ul.btnList li.withDate a' ).each do |a|
+				uri = @index_uri + a.attr( :href )
+				text = get_content( uri, book_title )
+				yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
 			end
-			(content / 'div.textBlock p' ).each do |para|
-				next if /＜次回につづく＞/ =~ para.text
-				text << '［＃１０字下げ］' if (para.attr('class') || '').index( 'txtAlignC' )
-				text << para.text.gsub( /<br>/, "\n" ) << "\n\n"
+		end
+		def get_content( uri, book_title = '' )
+			text = ''
+			html = open( uri, 'r:CP932', &:read ).encode( 'UTF-8' )
+			html = html.gsub( /\&mdash;/, "\u2500" ).gsub( /\&quot;/, "\u201D" )
+			(Nokogiri( html ) / 'div#mainContent' ).each do |content|
+				(content / 'h3').each do |t|
+					text << t.text.sub( /^『#{book_title}』　/, '' ).subhead
+				end
+				(content / 'div.textBlock p' ).each do |para|
+					next if /＜次回につづく＞/ =~ para.text
+					text << '［＃１０字下げ］' if (para.attr('class') || '').index( 'txtAlignC' )
+					text << para.text.gsub( /<br>/, "\n" ) << "\n\n"
+				end
 			end
+			text << "［＃改ページ］\n"
+			text.for_tategaki
 		end
-		text << "［＃改ページ］\n"
-		text.gsub( /＜/, '〈' ).gsub( /＞/, '〉' ).tr(
-			'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
-			'ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ０１２３４５６７８９'
-		)
 	end
 end

data/lib/aozoragen/sai-zen-sen.rb CHANGED Viewed

@@ -2,37 +2,54 @@
 #
 # scraping sai-zen-sen.jp
 #
-require 'nokogiri'
+require 'aozoragen/util'
 require 'open-uri'
 require 'pathname'
-class SaiZenSen
-	def initialize( index_uri )
-		if index_uri.path == '/sa/fate-zero/works/'
-			@entity = SaiZenSenFateZero::new( index_uri )
-		else
-			@entity = SaiZenSenRegular::new( index_uri )
-		end
-	end
+module Aozoragen
+	class SaiZenSen
+		include Util
-	def metainfo
-		@entity.metainfo
-	end
-	def each_chapter
-		@entity.each_chapter{|c| yield c}
-	end
+		def initialize( index_uri )
+			case index_uri.path
+			when '/sa/fate-zero/works/'
+				@entity = SaiZenSenFateZero::new( index_uri )
+			when %r|/01.html$| # short story
+				@entity = SaiZenSenShort::new( index_uri )
+			else
+				@entity = SaiZenSenRegular::new( index_uri )
+			end
+		end
+		def metainfo
+			@entity.metainfo
+		end
+		def each_chapter
+			@entity.each_chapter do |c|
+				c[:text] = c[:text].normalize_char
+				yield c
+			end
+		end
+		def each_chapter_local( selector )
+			(@index_html / selector).each do |a|
+				uri = @index_uri + a.attr('href')
+				next if uri.path == '/entryguide.html' # skipping member only contents.
-	def each_chapter_local( selector )
-		(@index_html / selector).each do |a|
-			uri = @index_uri + a.attr('href')
-			chapter = Nokogiri( open( uri, 'r:utf-8', &:read ) )
+				chapter = Nokogiri( open( uri, 'r:utf-8', &:read ) )
+				text = get_chapter_text( chapter )
+				yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
+			end
+		end
+		def get_chapter_text( chapter )
 			text = ''
 			(chapter / 'section.book-page-spread').each do |page|
 				page.children.each do |section|
 					case section.name
 					when 'hgroup'
-						text << "\n［＃小見出し］#{detag section}［＃小見出し終わり］\n\n"
+						text << detag( section ).subhead
 					when 'div'
 						case section.attr( 'class' )
 						when /delimiter/
@@ -54,61 +71,69 @@ class SaiZenSen
 				end
 				text << "［＃改ページ］\n"
 			end
-			yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
+			text
 		end
 	end
-	def detag( elem )
-		(elem / 'ruby rp').each do |rp|
-			case rp.text
-			when '（'
-				rp.inner_html = '《'
-			when '）'
-				rp.inner_html = '》'
+	class SaiZenSenRegular < SaiZenSen
+		def initialize( index_uri )
+			@index_uri = index_uri
+			@index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
+		end
+		def metainfo
+			info = {:id => Pathname( @index_uri.path ).basename.to_s, :author => []}
+			info[:title] = (@index_html / '#page-content-heading h1')[0].text
+			(@index_html / '#authors h3').each do |author|
+				info[:author] << author.text.sub( /.*? /, '' )
 			end
+			info
 		end
-		elem.to_html.gsub( /<.*?>/, '' ).strip
-	end
-end
-class SaiZenSenRegular < SaiZenSen
-	def initialize( index_uri )
-		@index_uri = index_uri
-		@index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
-	end
-	def metainfo
-		info = {:id => Pathname( @index_uri.path ).basename.to_s, :author => []}
-		info[:title] = (@index_html / '#page-content-heading h1')[0].text
-		(@index_html / '#authors h3').each do |author|
-			info[:author] << author.text.sub( /.*? /, '' )
+		def each_chapter
+			each_chapter_local( '#back-numbers li a' ){|c| yield c}
 		end
-		info
 	end
-	def each_chapter
-		each_chapter_local( '#back-numbers li a' ){|c| yield c}
-	end
-end
-class SaiZenSenFateZero < SaiZenSen
-	def initialize( index_uri )
-		@index_uri = index_uri
-		@index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
-	end
-	def metainfo
-		info = {
-			:id => Pathname( @index_uri.path ).basename.to_s,
-			:author => ['虚淵玄']
-		}
-		info[:title] = (@index_html / 'h1 img')[0].attr( 'alt' )
-		info
+	class SaiZenSenShort < SaiZenSen
+		def initialize( index_uri )
+			@index_uri = index_uri
+			@index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
+		end
+		def metainfo
+			info = {:id => Pathname( @index_uri.path ).dirname.dirname.basename.to_s, :author => []}
+			info[:title] = (@index_html / 'h1.book-title')[0].text
+			(@index_html / 'h2.book-author strong').each do |author|
+				info[:author] << author.text
+			end
+			info
+		end
+		def each_chapter
+			text = get_chapter_text( @index_html )
+			yield( {id: Pathname( @index_uri.path ).basename( '.html' ).to_s, uri: @index_uri, text: text} )
+		end
 	end
-	def each_chapter
-		each_chapter_local( 'article a' ){|c| yield c}
+	class SaiZenSenFateZero < SaiZenSen
+		def initialize( index_uri )
+			@index_uri = index_uri
+			@index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
+		end
+		def metainfo
+			info = {
+				:id => Pathname( @index_uri.path ).basename.to_s,
+				:author => ['虚淵玄']
+			}
+			info[:title] = (@index_html / 'h1 img')[0].attr( 'alt' )
+			info
+		end
+		def each_chapter
+			each_chapter_local( 'article a' ){|c| yield c}
+		end
 	end
 end

data/lib/aozoragen/util.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# -*- coding: utf-8; -*-
+#
+# utility methods for converting to AOZORA format
+#
+require 'nokogiri'
+##
+# Enhanced String methods for converting to AOZORA format
+#
+class String
+	##
+	# Half width of Alphabet and Digit to Full width.
+	#
+	def han2zen
+		self.tr( 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
+			'ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ０１２３４５６７８９' )
+	end
+	##
+	# replace characters fitting to vertical layout
+	#
+	def for_tategaki
+		self.tr( '＜＞‐', '〈〉─' ).han2zen
+	end
+	##
+	# section heading format of Aozora
+	#
+	def subhead
+		self.split( /\n/ ).map{|x|
+			"\n［＃小見出し］#{x}［＃小見出し終わり］"
+		}.join + "\n\n"
+	end
+	##
+	# normalize invalid characters
+	#
+	def normalize_char
+		self.tr( "\u6451\u5653\u7e6b\uFF0D/", '掴嘘繋─' )
+	end
+end
+module Aozoragen
+	##
+	# Utility methods for Aozora format
+	#
+	module Util
+		##
+		# delete HTML tags
+		#
+		def detag( elem )
+			# ruby tags
+			(elem / 'ruby rp').each do |rp|
+				case rp.text
+				when '（'
+					rp.inner_html = '《'
+				when '）'
+					rp.inner_html = '》'
+				end
+			end
+			# delete tags
+			elem.to_html.
+				gsub( /<br>/, "\n" ).
+				gsub( /<.*?>/, '' ).
+				strip
+		end
+	end
+end

data/lib/aozoragen/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Aozoragen
-  VERSION = "0.0.6"
+  VERSION = "0.1.0"
 end

data/lib/aozoragen/webmysteries.rb CHANGED Viewed

@@ -2,67 +2,71 @@
 #
 # scraping webmysteries.jp
 #
-require 'nokogiri'
+require 'aozoragen/util'
 require 'open-uri'
 require 'pathname'
 require 'cgi'
-class Webmysteries
-	def initialize( index_uri )
-		@index_uri = URI( index_uri )
-		@index_html = Nokogiri( open( @index_uri, 'r:UTF-8', &:read ) )
-	end
+module Aozoragen
+	class Webmysteries
+		include Util
-	def metainfo
-		info = {:id => Pathname( @index_uri.path ).basename.sub( %r|.*?-(.*)$|, '\1' ).to_s}
-		info[:title] = (@index_html / '#wiki-body h1')[0].text
-		(@index_html / '#wiki-body h2 + ul li' ).each do |li|
-			info[:author] = [li.text]
+		def initialize( index_uri )
+			@index_uri = URI( index_uri )
+			@index_html = Nokogiri( open( @index_uri, 'r:UTF-8', &:read ) )
 		end
-		info
-	end
-	def each_chapter
-		(@index_html / '#wiki-body h3 + ul li a' ).each_with_index do |a, i|
-			uri = URI( a.attr( 'href' ) )
-			text = "\n"
-			each_pages( uri ) do |page|
-				text << page
+		def metainfo
+			info = {:id => Pathname( @index_uri.path ).basename.sub( %r|.*?-(.*)$|, '\1' ).to_s}
+			info[:title] = (@index_html / '#wiki-body h1')[0].text
+			(@index_html / '#wiki-body h2 + ul li' ).each do |li|
+				info[:author] = [li.text]
 			end
-			yield( {id: '%02d' % (i+1), uri: uri, text: text} )
+			info
 		end
-	end
-	def each_pages( index )
-		begin
-			pages = []
-			html = Nokogiri( open( index, 'r', &:read ) )
-			(html / 'ul.pageNavi a').each do |a|
-				pages << a.attr( 'href' )
+		def each_chapter
+			(@index_html / '#wiki-body h3 + ul li a' ).each_with_index do |a, i|
+				uri = URI( a.attr( 'href' ) )
+				text = "\n"
+				each_pages( uri ) do |page|
+					text << page
+				end
+				yield( {id: '%02d' % (i+1), uri: uri, text: text} )
 			end
-			pages.shift	# delete current page
+		end
+		def each_pages( index )
 			begin
-				(html / 'noscript param[name="FlashVars"]')[0].attr( 'value' ).scan( /entry=(\d+)/ ) do |i|
-					yield get_text( i[0] )
+				pages = []
+				html = Nokogiri( open( index, 'r', &:read ) )
+				(html / 'ul.pageNavi a').each do |a|
+					pages << a.attr( 'href' )
 				end
-			end while html = Nokogiri( open( pages.shift, 'r', &:read ) )
-		rescue TypeError
-			# ignore open nil
+				pages.shift	# delete current page
+				begin
+					(html / 'noscript param[name="FlashVars"]')[0].attr( 'value' ).scan( /entry=(\d+)/ ) do |i|
+						yield get_text( i[0] )
+					end
+				end while html = Nokogiri( open( pages.shift, 'r', &:read ) )
+			rescue TypeError
+				# ignore open nil
+			end
 		end
-	end
-	def get_text( xml_id )
-		result = ''
-		open( "http://www.webmysteries.jp/entry_xml_data/#{xml_id}.xml" ) do |fx|
-			CGI::unescape( fx.read ).scan( %r|<entryBody>(.*?)</entryBody>|m ) do |entry|
-				result << entry[0].gsub( %r|<.*?>|m, "" )
+		def get_text( xml_id )
+			result = ''
+			open( "http://www.webmysteries.jp/entry_xml_data/#{xml_id}.xml" ) do |fx|
+				CGI::unescape( fx.read ).scan( %r|<entryBody>(.*?)</entryBody>|m ) do |entry|
+					result << entry[0].gsub( %r|<.*?>|m, "" )
+				end
 			end
+			result.
+				gsub( /^.*（つづく）.*$/, '［＃改ページ］' ).
+				gsub( /(?<=.)（([あ-ん]+)）/, '《\1》' ).
+				gsub( /\n{3,}/m, "\n\n" ).
+				for_tategaki.
+				normalize_char
 		end
-		result.
-			gsub( /^.*（つづく）.*$/, '［＃改ページ］' ).
-			gsub( /(?<=.)（([あ-ん]+)）/, '《\1》' ).
-			gsub( /‐/, '─' ).
-			gsub( /\uFF0D/, '─' ).
-			gsub( /\n{3,}/m, "\n\n" )
 	end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: aozoragen
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.1.0
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-02-27 00:00:00.000000000 Z
+date: 2012-02-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &81978730 !ruby/object:Gem::Requirement
+  requirement: &72469120 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *81978730
+  version_requirements: *72469120
 description: Scraping some Ebook web site and generating AOZORA format text files.
 email:
 - t@tdtds.jp
@@ -41,6 +41,7 @@ files:
 - lib/aozoragen.rb
 - lib/aozoragen/renzaburo.rb
 - lib/aozoragen/sai-zen-sen.rb
+- lib/aozoragen/util.rb
 - lib/aozoragen/version.rb
 - lib/aozoragen/webmysteries.rb
 homepage: https://github.com/tdtds/aozoragen