RubyGems - aozoragen - Versions diffs - 0.0.4 → 0.0.6 - Mend

aozoragen 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

data/.gitignore +1 -0
data/README.md +6 -1
data/bin/aozoragen +7 -0
data/lib/aozoragen/renzaburo.rb +6 -3
data/lib/aozoragen/sai-zen-sen.rb +2 -2
data/lib/aozoragen/version.rb +1 -1
data/lib/aozoragen/webmysteries.rb +68 -0
metadata +5 -4

data/.gitignore CHANGED

@@ -2,5 +2,6 @@
 .bundle
 Gemfile.lock
 pkg/*
+vendor/*
 *.txt
 *.pdf

data/README.md CHANGED

@@ -12,7 +12,7 @@ Gemを使ってインストールする:
     % aozoragen <URL>
-`URL`には日本語の小説をHTML形式で配布しているサイトの目次ページを指定する。カレントディレクトリに章ごとのテキストファイル(拡張子.txt)を生成する。ファイル名はサイトごとに自動的に決定され、`hoge.NN.txt` (NNは数値)のような形式となる。これらのファイルを連結すると一冊の本になる。
+`URL`には日本語の小説をHTML形式で配布しているサイトの目次ページを指定する。カレントディレクトリに章ごとのテキストファイル(拡張子.txt)を生成する。ファイル名はサイトごとに自動的に決定され、`hoge.NN.txt` (NNは連番数値またはその他の文字列)のような形式となる。これらのファイルを連結すると一冊の本になる。
 `aozoragen`コマンドが現在対応しているのは以下のサイト:
@@ -21,6 +21,11 @@ Gemを使ってインストールする:
  * 実行時に無償公開中の章のみが抽出される。
 * レンザブロー <http://renzaburo.jp/>
  * 指定例: http://renzaburo.jp/contents_t/061-katano/index.html
+* Webミステリーズ! <http://www.webmysteries.jp/>
+ * Webミステリーズ!の掲載作品には目次ページがないため、GitHubのWikiで代用する。
+ * h1要素に書名、h2要素に続くリストで著者名、h3要素に続くリストで連載各回のURLを表現する。
+ * WikiページのURLは、ファイル名が「webmysteries-」で始まるようにする。
+ * 指定例: https://github.com/tdtds/aozoragen/wiki/webmysteries-mm9_destruction
 ### aozora2pdf
 [青空キンドル](http://a2k.aill.org/)を使ってテキストをKindle向けPDFにする。パラメタにはaozoragenで生成した青空文庫形式のテキストファイルを順番通りに指定する。PDFは標準出力に出るので、リダイレクトする:

data/bin/aozoragen CHANGED

@@ -7,6 +7,7 @@
 # Distributed under GPL
 #
 require 'uri'
+require 'pathname'
 ARGV.each do |u|
 	uri = URI( u )
@@ -18,6 +19,12 @@ ARGV.each do |u|
 	when 'renzaburo.jp'
 		require 'aozoragen/renzaburo'
 		book = Renzaburo::new( uri )
+	when 'github.com'
+		case Pathname( uri.path ).basename.to_s.sub( %r|(.*?)-.*$|, '\1' )
+		when 'webmysteries'
+			require 'aozoragen/webmysteries'
+			book = Webmysteries::new( uri )
+		end
 	else
 		p "Error: unknown URI: #{uri}"
 	end

data/lib/aozoragen/renzaburo.rb CHANGED

@@ -39,15 +39,18 @@ class Renzaburo
 		(Nokogiri( html ) / 'div#mainContent' ).each do |content|
 			(content / 'h3').each do |t|
 				title = t.text.sub( /^『#{book_title}』　/, '' )
-				text << "\n　　　　　#{title}\n\n"
+				text << "\n［＃小見出し］#{title}［＃小見出し終わり］\n\n"
 			end
 			(content / 'div.textBlock p' ).each do |para|
 				next if /＜次回につづく＞/ =~ para.text
-				text << '　' * 10 if (para.attr('class') || '').index( 'txtAlignC' )
+				text << '［＃１０字下げ］' if (para.attr('class') || '').index( 'txtAlignC' )
 				text << para.text.gsub( /<br>/, "\n" ) << "\n\n"
 			end
 		end
 		text << "［＃改ページ］\n"
-		text.gsub( /＜/, '〈' ).gsub( /＞/, '〉' )
+		text.gsub( /＜/, '〈' ).gsub( /＞/, '〉' ).tr(
+			'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
+			'ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ０１２３４５６７８９'
+		)
 	end
 end

data/lib/aozoragen/sai-zen-sen.rb CHANGED

@@ -32,11 +32,11 @@ class SaiZenSen
 				page.children.each do |section|
 					case section.name
 					when 'hgroup'
-						text << "\n　　　　　#{detag section}\n\n"
+						text << "\n［＃小見出し］#{detag section}［＃小見出し終わり］\n\n"
 					when 'div'
 						case section.attr( 'class' )
 						when /delimiter/
-							text << '　' * 5 << '─' * 10 << "\n\n"
+							text << "［＃５字下げ］#{'─' * 10}\n\n"
 						when /pgroup/
 							(section / 'p').each do |paragraph|
 								text << "　#{detag paragraph}\n"

data/lib/aozoragen/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Aozoragen
-  VERSION = "0.0.4"
+  VERSION = "0.0.6"
 end

data/lib/aozoragen/webmysteries.rb ADDED

@@ -0,0 +1,68 @@
+# -*- coding: utf-8; -*-
+#
+# scraping webmysteries.jp
+#
+require 'nokogiri'
+require 'open-uri'
+require 'pathname'
+require 'cgi'
+class Webmysteries
+	def initialize( index_uri )
+		@index_uri = URI( index_uri )
+		@index_html = Nokogiri( open( @index_uri, 'r:UTF-8', &:read ) )
+	end
+	def metainfo
+		info = {:id => Pathname( @index_uri.path ).basename.sub( %r|.*?-(.*)$|, '\1' ).to_s}
+		info[:title] = (@index_html / '#wiki-body h1')[0].text
+		(@index_html / '#wiki-body h2 + ul li' ).each do |li|
+			info[:author] = [li.text]
+		end
+		info
+	end
+	def each_chapter
+		(@index_html / '#wiki-body h3 + ul li a' ).each_with_index do |a, i|
+			uri = URI( a.attr( 'href' ) )
+			text = "\n"
+			each_pages( uri ) do |page|
+				text << page
+			end
+			yield( {id: '%02d' % (i+1), uri: uri, text: text} )
+		end
+	end
+	def each_pages( index )
+		begin
+			pages = []
+			html = Nokogiri( open( index, 'r', &:read ) )
+			(html / 'ul.pageNavi a').each do |a|
+				pages << a.attr( 'href' )
+			end
+			pages.shift	# delete current page
+			begin
+				(html / 'noscript param[name="FlashVars"]')[0].attr( 'value' ).scan( /entry=(\d+)/ ) do |i|
+					yield get_text( i[0] )
+				end
+			end while html = Nokogiri( open( pages.shift, 'r', &:read ) )
+		rescue TypeError
+			# ignore open nil
+		end
+	end
+	def get_text( xml_id )
+		result = ''
+		open( "http://www.webmysteries.jp/entry_xml_data/#{xml_id}.xml" ) do |fx|
+			CGI::unescape( fx.read ).scan( %r|<entryBody>(.*?)</entryBody>|m ) do |entry|
+				result << entry[0].gsub( %r|<.*?>|m, "" )
+			end
+		end
+		result.
+			gsub( /^.*（つづく）.*$/, '［＃改ページ］' ).
+			gsub( /(?<=.)（([あ-ん]+)）/, '《\1》' ).
+			gsub( /‐/, '─' ).
+			gsub( /\uFF0D/, '─' ).
+			gsub( /\n{3,}/m, "\n\n" )
+	end
+end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: aozoragen
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.6
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-02-24 00:00:00.000000000 Z
+date: 2012-02-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &85482890 !ruby/object:Gem::Requirement
+  requirement: &81978730 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *85482890
+  version_requirements: *81978730
 description: Scraping some Ebook web site and generating AOZORA format text files.
 email:
 - t@tdtds.jp
@@ -42,6 +42,7 @@ files:
 - lib/aozoragen/renzaburo.rb
 - lib/aozoragen/sai-zen-sen.rb
 - lib/aozoragen/version.rb
+- lib/aozoragen/webmysteries.rb
 homepage: https://github.com/tdtds/aozoragen
 licenses: []
 post_install_message: