aozoragen 0.0.6 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/aozoragen CHANGED
@@ -6,37 +6,71 @@
6
6
  # Copyright (C) 2012 by TADA Tadashi <t@tdtds.jp>
7
7
  # Distributed under GPL
8
8
  #
9
+ require 'optparse'
9
10
  require 'uri'
10
11
  require 'pathname'
11
12
 
13
+ opts = {
14
+ overwrite: false,
15
+ output: '.',
16
+ verbose: false,
17
+ }
18
+ OptionParser.new do |o|
19
+ o.banner = 'Usage: aozoragen [options] <URI...>'
20
+ o.on( '-f', '--overwrite', 'force overwrite existent files.' ){|b| opts[:overwrite] = b}
21
+ o.on( '-O DIR', '--output DIR', 'specify output directory.' ){|dir| opts[:output] = dir}
22
+ o.on( '-v', '--verbose', 'show progress messages.' ){|b| opts[:verbose] = b}
23
+
24
+ begin
25
+ o.parse!
26
+ rescue OptionParser::InvalidOption
27
+ puts "invalid option\n\n#{o}"
28
+ exit -1
29
+ end
30
+ end
31
+
32
+ def write_open( file, opts )
33
+ name = "#{opts[:output]}/#{file}"
34
+ if !opts[:overwrite] && FileTest::exist?( name )
35
+ puts "Skipping write #{file}." if opts[:verbose]
36
+ return
37
+ end
38
+ open( name, 'w' ){|w| yield w}
39
+ puts name if opts[:verbose]
40
+ end
41
+
12
42
  ARGV.each do |u|
13
43
  uri = URI( u )
14
- book = nil
15
- case uri.host
16
- when 'sai-zen-sen.jp'
17
- require 'aozoragen/sai-zen-sen'
18
- book = SaiZenSen::new( uri )
19
- when 'renzaburo.jp'
20
- require 'aozoragen/renzaburo'
21
- book = Renzaburo::new( uri )
22
- when 'github.com'
23
- case Pathname( uri.path ).basename.to_s.sub( %r|(.*?)-.*$|, '\1' )
24
- when 'webmysteries'
25
- require 'aozoragen/webmysteries'
26
- book = Webmysteries::new( uri )
44
+ service = case uri.host
45
+ when 'sai-zen-sen.jp'
46
+ 'sai-zen-sen'
47
+ when 'renzaburo.jp'
48
+ 'renzaburo'
49
+ when 'github.com'
50
+ case Pathname( uri.path ).basename.to_s.sub( %r|(.*?)-.*$|, '\1' )
51
+ when 'webmysteries'
52
+ 'webmisteries'
53
+ else
54
+ nil
55
+ end
56
+ else
57
+ nil
27
58
  end
28
- else
29
- p "Error: unknown URI: #{uri}"
59
+ unless service
60
+ puts "Skipping unknown URI: #{uri}"
61
+ next
30
62
  end
31
- next unless book
63
+
64
+ require "aozoragen/#{service}"
65
+ book = (Aozoragen.const_get service.split(/-/).map{|s| s.capitalize}.join)::new( uri )
32
66
 
33
67
  meta = book.metainfo
34
- open( "#{meta[:id]}.00.txt", 'w' ) do |w|
68
+ write_open( "#{meta[:id]}.00.txt", opts ) do |w|
35
69
  w.puts "#{meta[:title]}\n#{meta[:author].join ' / '}\n\n\n[#改ページ]"
36
70
  end
37
71
 
38
72
  book.each_chapter do |chapter|
39
- open( "#{meta[:id]}.#{chapter[:id]}.txt", 'w' ) do |w|
73
+ write_open( "#{meta[:id]}.#{chapter[:id]}.txt", opts ) do |w|
40
74
  w.puts chapter[:text]
41
75
  end
42
76
  end
@@ -2,55 +2,55 @@
2
2
  #
3
3
  # scraping renzaburo.jp
4
4
  #
5
- require 'nokogiri'
5
+ require 'aozoragen/util'
6
6
  require 'open-uri'
7
7
  require 'pathname'
8
8
 
9
- class Renzaburo
10
- def initialize( index_uri )
11
- @index_uri = URI( index_uri.to_s.sub( /index\.html$/, '' ) )
12
- @index_html = Nokogiri( open( @index_uri, 'r:CP932', &:read ) )
13
- end
9
+ module Aozoragen
10
+ class Renzaburo
11
+ include Util
14
12
 
15
- def metainfo
16
- info = {:id => Pathname( @index_uri.path ).basename.to_s}
17
- (@index_html / 'title').each do |t|
18
- info[:title] = t.text.sub( /|.*/, '' )
19
- end
20
- (@index_html / 'div.textBlock strong' ).each do |st|
21
- info[:author] = [st.text]
13
+ def initialize( index_uri )
14
+ @index_uri = URI( index_uri.to_s.sub( /index\.html$/, '' ) )
15
+ @index_html = Nokogiri( open( @index_uri, 'r:CP932', &:read ) )
22
16
  end
23
- info
24
- end
25
-
26
- def each_chapter
27
- book_title = metainfo[:title]
28
- (@index_html / 'ul.btnList li.withDate a' ).each do |a|
29
- uri = @index_uri + a.attr( :href )
30
- text = get_content( uri, book_title )
31
- yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
17
+
18
+ def metainfo
19
+ info = {:id => Pathname( @index_uri.path ).basename.to_s}
20
+ (@index_html / 'title').each do |t|
21
+ info[:title] = t.text.sub( /|.*/, '' )
22
+ end
23
+ (@index_html / 'div.textBlock strong' ).each do |st|
24
+ info[:author] = [st.text]
25
+ end
26
+ info
32
27
  end
33
- end
34
-
35
- def get_content( uri, book_title = '' )
36
- text = ''
37
- html = open( uri, 'r:CP932', &:read ).encode( 'UTF-8' )
38
- html = html.gsub( /\&mdash;/, "\u2500" ).gsub( /\&quot;/, "\u201D" )
39
- (Nokogiri( html ) / 'div#mainContent' ).each do |content|
40
- (content / 'h3').each do |t|
41
- title = t.text.sub( /^『#{book_title}』 /, '' )
42
- text << "\n[#小見出し]#{title}[#小見出し終わり]\n\n"
28
+
29
+ def each_chapter
30
+ book_title = metainfo[:title]
31
+ (@index_html / 'ul.btnList li.withDate a' ).each do |a|
32
+ uri = @index_uri + a.attr( :href )
33
+ text = get_content( uri, book_title )
34
+ yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
43
35
  end
44
- (content / 'div.textBlock p' ).each do |para|
45
- next if /<次回につづく>/ =~ para.text
46
- text << '[#10字下げ]' if (para.attr('class') || '').index( 'txtAlignC' )
47
- text << para.text.gsub( /<br>/, "\n" ) << "\n\n"
36
+ end
37
+
38
+ def get_content( uri, book_title = '' )
39
+ text = ''
40
+ html = open( uri, 'r:CP932', &:read ).encode( 'UTF-8' )
41
+ html = html.gsub( /\&mdash;/, "\u2500" ).gsub( /\&quot;/, "\u201D" )
42
+ (Nokogiri( html ) / 'div#mainContent' ).each do |content|
43
+ (content / 'h3').each do |t|
44
+ text << t.text.sub( /^『#{book_title}』 /, '' ).subhead
45
+ end
46
+ (content / 'div.textBlock p' ).each do |para|
47
+ next if /<次回につづく>/ =~ para.text
48
+ text << '[#10字下げ]' if (para.attr('class') || '').index( 'txtAlignC' )
49
+ text << para.text.gsub( /<br>/, "\n" ) << "\n\n"
50
+ end
48
51
  end
52
+ text << "[#改ページ]\n"
53
+ text.for_tategaki
49
54
  end
50
- text << "[#改ページ]\n"
51
- text.gsub( /</, '〈' ).gsub( />/, '〉' ).tr(
52
- 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
53
- 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
54
- )
55
55
  end
56
56
  end
@@ -2,37 +2,54 @@
2
2
  #
3
3
  # scraping sai-zen-sen.jp
4
4
  #
5
- require 'nokogiri'
5
+ require 'aozoragen/util'
6
6
  require 'open-uri'
7
7
  require 'pathname'
8
8
 
9
- class SaiZenSen
10
- def initialize( index_uri )
11
- if index_uri.path == '/sa/fate-zero/works/'
12
- @entity = SaiZenSenFateZero::new( index_uri )
13
- else
14
- @entity = SaiZenSenRegular::new( index_uri )
15
- end
16
- end
9
+ module Aozoragen
10
+ class SaiZenSen
11
+ include Util
17
12
 
18
- def metainfo
19
- @entity.metainfo
20
- end
21
-
22
- def each_chapter
23
- @entity.each_chapter{|c| yield c}
24
- end
13
+ def initialize( index_uri )
14
+ case index_uri.path
15
+ when '/sa/fate-zero/works/'
16
+ @entity = SaiZenSenFateZero::new( index_uri )
17
+ when %r|/01.html$| # short story
18
+ @entity = SaiZenSenShort::new( index_uri )
19
+ else
20
+ @entity = SaiZenSenRegular::new( index_uri )
21
+ end
22
+ end
23
+
24
+ def metainfo
25
+ @entity.metainfo
26
+ end
27
+
28
+ def each_chapter
29
+ @entity.each_chapter do |c|
30
+ c[:text] = c[:text].normalize_char
31
+ yield c
32
+ end
33
+ end
34
+
35
+ def each_chapter_local( selector )
36
+ (@index_html / selector).each do |a|
37
+ uri = @index_uri + a.attr('href')
38
+ next if uri.path == '/entryguide.html' # skipping member only contents.
25
39
 
26
- def each_chapter_local( selector )
27
- (@index_html / selector).each do |a|
28
- uri = @index_uri + a.attr('href')
29
- chapter = Nokogiri( open( uri, 'r:utf-8', &:read ) )
40
+ chapter = Nokogiri( open( uri, 'r:utf-8', &:read ) )
41
+ text = get_chapter_text( chapter )
42
+ yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
43
+ end
44
+ end
45
+
46
+ def get_chapter_text( chapter )
30
47
  text = ''
31
48
  (chapter / 'section.book-page-spread').each do |page|
32
49
  page.children.each do |section|
33
50
  case section.name
34
51
  when 'hgroup'
35
- text << "\n[#小見出し]#{detag section}[#小見出し終わり]\n\n"
52
+ text << detag( section ).subhead
36
53
  when 'div'
37
54
  case section.attr( 'class' )
38
55
  when /delimiter/
@@ -54,61 +71,69 @@ class SaiZenSen
54
71
  end
55
72
  text << "[#改ページ]\n"
56
73
  end
57
-
58
- yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
74
+ text
59
75
  end
60
76
  end
61
-
62
- def detag( elem )
63
- (elem / 'ruby rp').each do |rp|
64
- case rp.text
65
- when ''
66
- rp.inner_html = '《'
67
- when ')'
68
- rp.inner_html = '》'
77
+
78
+ class SaiZenSenRegular < SaiZenSen
79
+ def initialize( index_uri )
80
+ @index_uri = index_uri
81
+ @index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
82
+ end
83
+
84
+ def metainfo
85
+ info = {:id => Pathname( @index_uri.path ).basename.to_s, :author => []}
86
+ info[:title] = (@index_html / '#page-content-heading h1')[0].text
87
+ (@index_html / '#authors h3').each do |author|
88
+ info[:author] << author.text.sub( /.*? /, '' )
69
89
  end
90
+ info
70
91
  end
71
- elem.to_html.gsub( /<.*?>/, '' ).strip
72
- end
73
- end
74
-
75
- class SaiZenSenRegular < SaiZenSen
76
- def initialize( index_uri )
77
- @index_uri = index_uri
78
- @index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
79
- end
80
-
81
- def metainfo
82
- info = {:id => Pathname( @index_uri.path ).basename.to_s, :author => []}
83
- info[:title] = (@index_html / '#page-content-heading h1')[0].text
84
- (@index_html / '#authors h3').each do |author|
85
- info[:author] << author.text.sub( /.*? /, '' )
92
+
93
+ def each_chapter
94
+ each_chapter_local( '#back-numbers li a' ){|c| yield c}
86
95
  end
87
- info
88
96
  end
89
-
90
- def each_chapter
91
- each_chapter_local( '#back-numbers li a' ){|c| yield c}
92
- end
93
- end
94
-
95
- class SaiZenSenFateZero < SaiZenSen
96
- def initialize( index_uri )
97
- @index_uri = index_uri
98
- @index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
99
- end
100
-
101
- def metainfo
102
- info = {
103
- :id => Pathname( @index_uri.path ).basename.to_s,
104
- :author => ['虚淵玄']
105
- }
106
- info[:title] = (@index_html / 'h1 img')[0].attr( 'alt' )
107
- info
97
+
98
+ class SaiZenSenShort < SaiZenSen
99
+ def initialize( index_uri )
100
+ @index_uri = index_uri
101
+ @index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
102
+ end
103
+
104
+ def metainfo
105
+ info = {:id => Pathname( @index_uri.path ).dirname.dirname.basename.to_s, :author => []}
106
+ info[:title] = (@index_html / 'h1.book-title')[0].text
107
+ (@index_html / 'h2.book-author strong').each do |author|
108
+ info[:author] << author.text
109
+ end
110
+ info
111
+ end
112
+
113
+ def each_chapter
114
+ text = get_chapter_text( @index_html )
115
+ yield( {id: Pathname( @index_uri.path ).basename( '.html' ).to_s, uri: @index_uri, text: text} )
116
+ end
108
117
  end
109
-
110
- def each_chapter
111
- each_chapter_local( 'article a' ){|c| yield c}
118
+
119
+ class SaiZenSenFateZero < SaiZenSen
120
+ def initialize( index_uri )
121
+ @index_uri = index_uri
122
+ @index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
123
+ end
124
+
125
+ def metainfo
126
+ info = {
127
+ :id => Pathname( @index_uri.path ).basename.to_s,
128
+ :author => ['虚淵玄']
129
+ }
130
+ info[:title] = (@index_html / 'h1 img')[0].attr( 'alt' )
131
+ info
132
+ end
133
+
134
+ def each_chapter
135
+ each_chapter_local( 'article a' ){|c| yield c}
136
+ end
137
+
112
138
  end
113
-
114
139
  end
@@ -0,0 +1,69 @@
1
+ # -*- coding: utf-8; -*-
2
+ #
3
+ # utility methods for converting to AOZORA format
4
+ #
5
+ require 'nokogiri'
6
+
7
+ ##
8
+ # Enhanced String methods for converting to AOZORA format
9
+ #
10
+ class String
11
+ ##
12
+ # Convert half-width alphabet letters and digits to full-width.
13
+ #
14
+ def han2zen
15
+ self.tr( 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
16
+ 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789' )
17
+ end
18
+
19
+ ##
20
+ # replace characters to suit vertical layout
21
+ #
22
+ def for_tategaki
23
+ self.tr( '<>‐', '〈〉─' ).han2zen
24
+ end
25
+
26
+ ##
27
+ # section heading format of Aozora
28
+ #
29
+ def subhead
30
+ self.split( /\n/ ).map{|x|
31
+ "\n[#小見出し]#{x}[#小見出し終わり]"
32
+ }.join + "\n\n"
33
+ end
34
+
35
+ ##
36
+ # normalize invalid characters
37
+ #
38
+ def normalize_char
39
+ self.tr( "\u6451\u5653\u7e6b\uFF0D/", '掴嘘繋─' )
40
+ end
41
+ end
42
+
43
+ module Aozoragen
44
+ ##
45
+ # Utility methods for Aozora format
46
+ #
47
+ module Util
48
+ ##
49
+ # delete HTML tags
50
+ #
51
+ def detag( elem )
52
+ # ruby tags
53
+ (elem / 'ruby rp').each do |rp|
54
+ case rp.text
55
+ when '('
56
+ rp.inner_html = '《'
57
+ when ')'
58
+ rp.inner_html = '》'
59
+ end
60
+ end
61
+
62
+ # delete tags
63
+ elem.to_html.
64
+ gsub( /<br>/, "\n" ).
65
+ gsub( /<.*?>/, '' ).
66
+ strip
67
+ end
68
+ end
69
+ end
@@ -1,3 +1,3 @@
1
1
  module Aozoragen
2
- VERSION = "0.0.6"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -2,67 +2,71 @@
2
2
  #
3
3
  # scraping webmysteries.jp
4
4
  #
5
- require 'nokogiri'
5
+ require 'aozoragen/util'
6
6
  require 'open-uri'
7
7
  require 'pathname'
8
8
  require 'cgi'
9
9
 
10
- class Webmysteries
11
- def initialize( index_uri )
12
- @index_uri = URI( index_uri )
13
- @index_html = Nokogiri( open( @index_uri, 'r:UTF-8', &:read ) )
14
- end
10
+ module Aozoragen
11
+ class Webmysteries
12
+ include Util
15
13
 
16
- def metainfo
17
- info = {:id => Pathname( @index_uri.path ).basename.sub( %r|.*?-(.*)$|, '\1' ).to_s}
18
- info[:title] = (@index_html / '#wiki-body h1')[0].text
19
- (@index_html / '#wiki-body h2 + ul li' ).each do |li|
20
- info[:author] = [li.text]
14
+ def initialize( index_uri )
15
+ @index_uri = URI( index_uri )
16
+ @index_html = Nokogiri( open( @index_uri, 'r:UTF-8', &:read ) )
21
17
  end
22
- info
23
- end
24
-
25
- def each_chapter
26
- (@index_html / '#wiki-body h3 + ul li a' ).each_with_index do |a, i|
27
- uri = URI( a.attr( 'href' ) )
28
- text = "\n"
29
- each_pages( uri ) do |page|
30
- text << page
18
+
19
+ def metainfo
20
+ info = {:id => Pathname( @index_uri.path ).basename.sub( %r|.*?-(.*)$|, '\1' ).to_s}
21
+ info[:title] = (@index_html / '#wiki-body h1')[0].text
22
+ (@index_html / '#wiki-body h2 + ul li' ).each do |li|
23
+ info[:author] = [li.text]
31
24
  end
32
- yield( {id: '%02d' % (i+1), uri: uri, text: text} )
25
+ info
33
26
  end
34
- end
35
-
36
- def each_pages( index )
37
- begin
38
- pages = []
39
- html = Nokogiri( open( index, 'r', &:read ) )
40
- (html / 'ul.pageNavi a').each do |a|
41
- pages << a.attr( 'href' )
27
+
28
+ def each_chapter
29
+ (@index_html / '#wiki-body h3 + ul li a' ).each_with_index do |a, i|
30
+ uri = URI( a.attr( 'href' ) )
31
+ text = "\n"
32
+ each_pages( uri ) do |page|
33
+ text << page
34
+ end
35
+ yield( {id: '%02d' % (i+1), uri: uri, text: text} )
42
36
  end
43
- pages.shift # delete current page
37
+ end
38
+
39
+ def each_pages( index )
44
40
  begin
45
- (html / 'noscript param[name="FlashVars"]')[0].attr( 'value' ).scan( /entry=(\d+)/ ) do |i|
46
- yield get_text( i[0] )
41
+ pages = []
42
+ html = Nokogiri( open( index, 'r', &:read ) )
43
+ (html / 'ul.pageNavi a').each do |a|
44
+ pages << a.attr( 'href' )
47
45
  end
48
- end while html = Nokogiri( open( pages.shift, 'r', &:read ) )
49
- rescue TypeError
50
- # ignore open nil
46
+ pages.shift # delete current page
47
+ begin
48
+ (html / 'noscript param[name="FlashVars"]')[0].attr( 'value' ).scan( /entry=(\d+)/ ) do |i|
49
+ yield get_text( i[0] )
50
+ end
51
+ end while html = Nokogiri( open( pages.shift, 'r', &:read ) )
52
+ rescue TypeError
53
+ # ignore open nil
54
+ end
51
55
  end
52
- end
53
-
54
- def get_text( xml_id )
55
- result = ''
56
- open( "http://www.webmysteries.jp/entry_xml_data/#{xml_id}.xml" ) do |fx|
57
- CGI::unescape( fx.read ).scan( %r|<entryBody>(.*?)</entryBody>|m ) do |entry|
58
- result << entry[0].gsub( %r|<.*?>|m, "" )
56
+
57
+ def get_text( xml_id )
58
+ result = ''
59
+ open( "http://www.webmysteries.jp/entry_xml_data/#{xml_id}.xml" ) do |fx|
60
+ CGI::unescape( fx.read ).scan( %r|<entryBody>(.*?)</entryBody>|m ) do |entry|
61
+ result << entry[0].gsub( %r|<.*?>|m, "" )
62
+ end
59
63
  end
64
+ result.
65
+ gsub( /^.*(つづく).*$/, '[#改ページ]' ).
66
+ gsub( /(?<=.)(([あ-ん]+))/, '《\1》' ).
67
+ gsub( /\n{3,}/m, "\n\n" ).
68
+ for_tategaki.
69
+ normalize_char
60
70
  end
61
- result.
62
- gsub( /^.*(つづく).*$/, '[#改ページ]' ).
63
- gsub( /(?<=.)(([あ-ん]+))/, '《\1》' ).
64
- gsub( /‐/, '─' ).
65
- gsub( /\uFF0D/, '─' ).
66
- gsub( /\n{3,}/m, "\n\n" )
67
71
  end
68
72
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aozoragen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-27 00:00:00.000000000 Z
12
+ date: 2012-02-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &81978730 !ruby/object:Gem::Requirement
16
+ requirement: &72469120 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *81978730
24
+ version_requirements: *72469120
25
25
  description: Scraping some Ebook web site and generating AOZORA format text files.
26
26
  email:
27
27
  - t@tdtds.jp
@@ -41,6 +41,7 @@ files:
41
41
  - lib/aozoragen.rb
42
42
  - lib/aozoragen/renzaburo.rb
43
43
  - lib/aozoragen/sai-zen-sen.rb
44
+ - lib/aozoragen/util.rb
44
45
  - lib/aozoragen/version.rb
45
46
  - lib/aozoragen/webmysteries.rb
46
47
  homepage: https://github.com/tdtds/aozoragen