aozoragen 0.0.6 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/bin/aozoragen CHANGED
@@ -6,37 +6,71 @@
6
6
  # Copyright (C) 2012 by TADA Tadashi <t@tdtds.jp>
7
7
  # Distributed under GPL
8
8
  #
9
+ require 'optparse'
9
10
  require 'uri'
10
11
  require 'pathname'
11
12
 
13
+ opts = {
14
+ overwrite: false,
15
+ output: '.',
16
+ verbose: false,
17
+ }
18
+ OptionParser.new do |o|
19
+ o.banner = 'Usage: aozoragen [options] <URI...>'
20
+ o.on( '-f', '--overwrite', 'force overwrite existent files.' ){|b| opts[:overwrite] = b}
21
+ o.on( '-O DIR', '--output DIR', 'specify output directory.' ){|dir| opts[:output] = dir}
22
+ o.on( '-v', '--verbose', 'show progress messages.' ){|b| opts[:verbose] = b}
23
+
24
+ begin
25
+ o.parse!
26
+ rescue OptionParser::InvalidOption
27
+ puts "invalid option\n\n#{o}"
28
+ exit -1
29
+ end
30
+ end
31
+
32
+ def write_open( file, opts )
33
+ name = "#{opts[:output]}/#{file}"
34
+ if !opts[:overwrite] && FileTest::exist?( name )
35
+ puts "Skipping write #{file}." if opts[:verbose]
36
+ return
37
+ end
38
+ open( name, 'w' ){|w| yield w}
39
+ puts name if opts[:verbose]
40
+ end
41
+
12
42
  ARGV.each do |u|
13
43
  uri = URI( u )
14
- book = nil
15
- case uri.host
16
- when 'sai-zen-sen.jp'
17
- require 'aozoragen/sai-zen-sen'
18
- book = SaiZenSen::new( uri )
19
- when 'renzaburo.jp'
20
- require 'aozoragen/renzaburo'
21
- book = Renzaburo::new( uri )
22
- when 'github.com'
23
- case Pathname( uri.path ).basename.to_s.sub( %r|(.*?)-.*$|, '\1' )
24
- when 'webmysteries'
25
- require 'aozoragen/webmysteries'
26
- book = Webmysteries::new( uri )
44
+ service = case uri.host
45
+ when 'sai-zen-sen.jp'
46
+ 'sai-zen-sen'
47
+ when 'renzaburo.jp'
48
+ 'renzaburo'
49
+ when 'github.com'
50
+ case Pathname( uri.path ).basename.to_s.sub( %r|(.*?)-.*$|, '\1' )
51
+ when 'webmysteries'
52
+ 'webmisteries'
53
+ else
54
+ nil
55
+ end
56
+ else
57
+ nil
27
58
  end
28
- else
29
- p "Error: unknown URI: #{uri}"
59
+ unless service
60
+ puts "Skipping unknown URI: #{uri}"
61
+ next
30
62
  end
31
- next unless book
63
+
64
+ require "aozoragen/#{service}"
65
+ book = (Aozoragen.const_get service.split(/-/).map{|s| s.capitalize}.join)::new( uri )
32
66
 
33
67
  meta = book.metainfo
34
- open( "#{meta[:id]}.00.txt", 'w' ) do |w|
68
+ write_open( "#{meta[:id]}.00.txt", opts ) do |w|
35
69
  w.puts "#{meta[:title]}\n#{meta[:author].join ' / '}\n\n\n[#改ページ]"
36
70
  end
37
71
 
38
72
  book.each_chapter do |chapter|
39
- open( "#{meta[:id]}.#{chapter[:id]}.txt", 'w' ) do |w|
73
+ write_open( "#{meta[:id]}.#{chapter[:id]}.txt", opts ) do |w|
40
74
  w.puts chapter[:text]
41
75
  end
42
76
  end
@@ -2,55 +2,55 @@
2
2
  #
3
3
  # scraping renzaburo.jp
4
4
  #
5
- require 'nokogiri'
5
+ require 'aozoragen/util'
6
6
  require 'open-uri'
7
7
  require 'pathname'
8
8
 
9
- class Renzaburo
10
- def initialize( index_uri )
11
- @index_uri = URI( index_uri.to_s.sub( /index\.html$/, '' ) )
12
- @index_html = Nokogiri( open( @index_uri, 'r:CP932', &:read ) )
13
- end
9
+ module Aozoragen
10
+ class Renzaburo
11
+ include Util
14
12
 
15
- def metainfo
16
- info = {:id => Pathname( @index_uri.path ).basename.to_s}
17
- (@index_html / 'title').each do |t|
18
- info[:title] = t.text.sub( /|.*/, '' )
19
- end
20
- (@index_html / 'div.textBlock strong' ).each do |st|
21
- info[:author] = [st.text]
13
+ def initialize( index_uri )
14
+ @index_uri = URI( index_uri.to_s.sub( /index\.html$/, '' ) )
15
+ @index_html = Nokogiri( open( @index_uri, 'r:CP932', &:read ) )
22
16
  end
23
- info
24
- end
25
-
26
- def each_chapter
27
- book_title = metainfo[:title]
28
- (@index_html / 'ul.btnList li.withDate a' ).each do |a|
29
- uri = @index_uri + a.attr( :href )
30
- text = get_content( uri, book_title )
31
- yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
17
+
18
+ def metainfo
19
+ info = {:id => Pathname( @index_uri.path ).basename.to_s}
20
+ (@index_html / 'title').each do |t|
21
+ info[:title] = t.text.sub( /|.*/, '' )
22
+ end
23
+ (@index_html / 'div.textBlock strong' ).each do |st|
24
+ info[:author] = [st.text]
25
+ end
26
+ info
32
27
  end
33
- end
34
-
35
- def get_content( uri, book_title = '' )
36
- text = ''
37
- html = open( uri, 'r:CP932', &:read ).encode( 'UTF-8' )
38
- html = html.gsub( /\&mdash;/, "\u2500" ).gsub( /\&quot;/, "\u201D" )
39
- (Nokogiri( html ) / 'div#mainContent' ).each do |content|
40
- (content / 'h3').each do |t|
41
- title = t.text.sub( /^『#{book_title}』 /, '' )
42
- text << "\n[#小見出し]#{title}[#小見出し終わり]\n\n"
28
+
29
+ def each_chapter
30
+ book_title = metainfo[:title]
31
+ (@index_html / 'ul.btnList li.withDate a' ).each do |a|
32
+ uri = @index_uri + a.attr( :href )
33
+ text = get_content( uri, book_title )
34
+ yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
43
35
  end
44
- (content / 'div.textBlock p' ).each do |para|
45
- next if /<次回につづく>/ =~ para.text
46
- text << '[#10字下げ]' if (para.attr('class') || '').index( 'txtAlignC' )
47
- text << para.text.gsub( /<br>/, "\n" ) << "\n\n"
36
+ end
37
+
38
+ def get_content( uri, book_title = '' )
39
+ text = ''
40
+ html = open( uri, 'r:CP932', &:read ).encode( 'UTF-8' )
41
+ html = html.gsub( /\&mdash;/, "\u2500" ).gsub( /\&quot;/, "\u201D" )
42
+ (Nokogiri( html ) / 'div#mainContent' ).each do |content|
43
+ (content / 'h3').each do |t|
44
+ text << t.text.sub( /^『#{book_title}』 /, '' ).subhead
45
+ end
46
+ (content / 'div.textBlock p' ).each do |para|
47
+ next if /<次回につづく>/ =~ para.text
48
+ text << '[#10字下げ]' if (para.attr('class') || '').index( 'txtAlignC' )
49
+ text << para.text.gsub( /<br>/, "\n" ) << "\n\n"
50
+ end
48
51
  end
52
+ text << "[#改ページ]\n"
53
+ text.for_tategaki
49
54
  end
50
- text << "[#改ページ]\n"
51
- text.gsub( /</, '〈' ).gsub( />/, '〉' ).tr(
52
- 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
53
- 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
54
- )
55
55
  end
56
56
  end
@@ -2,37 +2,54 @@
2
2
  #
3
3
  # scraping sai-zen-sen.jp
4
4
  #
5
- require 'nokogiri'
5
+ require 'aozoragen/util'
6
6
  require 'open-uri'
7
7
  require 'pathname'
8
8
 
9
- class SaiZenSen
10
- def initialize( index_uri )
11
- if index_uri.path == '/sa/fate-zero/works/'
12
- @entity = SaiZenSenFateZero::new( index_uri )
13
- else
14
- @entity = SaiZenSenRegular::new( index_uri )
15
- end
16
- end
9
+ module Aozoragen
10
+ class SaiZenSen
11
+ include Util
17
12
 
18
- def metainfo
19
- @entity.metainfo
20
- end
21
-
22
- def each_chapter
23
- @entity.each_chapter{|c| yield c}
24
- end
13
+ def initialize( index_uri )
14
+ case index_uri.path
15
+ when '/sa/fate-zero/works/'
16
+ @entity = SaiZenSenFateZero::new( index_uri )
17
+ when %r|/01.html$| # short story
18
+ @entity = SaiZenSenShort::new( index_uri )
19
+ else
20
+ @entity = SaiZenSenRegular::new( index_uri )
21
+ end
22
+ end
23
+
24
+ def metainfo
25
+ @entity.metainfo
26
+ end
27
+
28
+ def each_chapter
29
+ @entity.each_chapter do |c|
30
+ c[:text] = c[:text].normalize_char
31
+ yield c
32
+ end
33
+ end
34
+
35
+ def each_chapter_local( selector )
36
+ (@index_html / selector).each do |a|
37
+ uri = @index_uri + a.attr('href')
38
+ next if uri.path == '/entryguide.html' # skipping member only contents.
25
39
 
26
- def each_chapter_local( selector )
27
- (@index_html / selector).each do |a|
28
- uri = @index_uri + a.attr('href')
29
- chapter = Nokogiri( open( uri, 'r:utf-8', &:read ) )
40
+ chapter = Nokogiri( open( uri, 'r:utf-8', &:read ) )
41
+ text = get_chapter_text( chapter )
42
+ yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
43
+ end
44
+ end
45
+
46
+ def get_chapter_text( chapter )
30
47
  text = ''
31
48
  (chapter / 'section.book-page-spread').each do |page|
32
49
  page.children.each do |section|
33
50
  case section.name
34
51
  when 'hgroup'
35
- text << "\n[#小見出し]#{detag section}[#小見出し終わり]\n\n"
52
+ text << detag( section ).subhead
36
53
  when 'div'
37
54
  case section.attr( 'class' )
38
55
  when /delimiter/
@@ -54,61 +71,69 @@ class SaiZenSen
54
71
  end
55
72
  text << "[#改ページ]\n"
56
73
  end
57
-
58
- yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
74
+ text
59
75
  end
60
76
  end
61
-
62
- def detag( elem )
63
- (elem / 'ruby rp').each do |rp|
64
- case rp.text
65
- when ''
66
- rp.inner_html = '《'
67
- when ')'
68
- rp.inner_html = '》'
77
+
78
+ class SaiZenSenRegular < SaiZenSen
79
+ def initialize( index_uri )
80
+ @index_uri = index_uri
81
+ @index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
82
+ end
83
+
84
+ def metainfo
85
+ info = {:id => Pathname( @index_uri.path ).basename.to_s, :author => []}
86
+ info[:title] = (@index_html / '#page-content-heading h1')[0].text
87
+ (@index_html / '#authors h3').each do |author|
88
+ info[:author] << author.text.sub( /.*? /, '' )
69
89
  end
90
+ info
70
91
  end
71
- elem.to_html.gsub( /<.*?>/, '' ).strip
72
- end
73
- end
74
-
75
- class SaiZenSenRegular < SaiZenSen
76
- def initialize( index_uri )
77
- @index_uri = index_uri
78
- @index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
79
- end
80
-
81
- def metainfo
82
- info = {:id => Pathname( @index_uri.path ).basename.to_s, :author => []}
83
- info[:title] = (@index_html / '#page-content-heading h1')[0].text
84
- (@index_html / '#authors h3').each do |author|
85
- info[:author] << author.text.sub( /.*? /, '' )
92
+
93
+ def each_chapter
94
+ each_chapter_local( '#back-numbers li a' ){|c| yield c}
86
95
  end
87
- info
88
96
  end
89
-
90
- def each_chapter
91
- each_chapter_local( '#back-numbers li a' ){|c| yield c}
92
- end
93
- end
94
-
95
- class SaiZenSenFateZero < SaiZenSen
96
- def initialize( index_uri )
97
- @index_uri = index_uri
98
- @index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
99
- end
100
-
101
- def metainfo
102
- info = {
103
- :id => Pathname( @index_uri.path ).basename.to_s,
104
- :author => ['虚淵玄']
105
- }
106
- info[:title] = (@index_html / 'h1 img')[0].attr( 'alt' )
107
- info
97
+
98
+ class SaiZenSenShort < SaiZenSen
99
+ def initialize( index_uri )
100
+ @index_uri = index_uri
101
+ @index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
102
+ end
103
+
104
+ def metainfo
105
+ info = {:id => Pathname( @index_uri.path ).dirname.dirname.basename.to_s, :author => []}
106
+ info[:title] = (@index_html / 'h1.book-title')[0].text
107
+ (@index_html / 'h2.book-author strong').each do |author|
108
+ info[:author] << author.text
109
+ end
110
+ info
111
+ end
112
+
113
+ def each_chapter
114
+ text = get_chapter_text( @index_html )
115
+ yield( {id: Pathname( @index_uri.path ).basename( '.html' ).to_s, uri: @index_uri, text: text} )
116
+ end
108
117
  end
109
-
110
- def each_chapter
111
- each_chapter_local( 'article a' ){|c| yield c}
118
+
119
+ class SaiZenSenFateZero < SaiZenSen
120
+ def initialize( index_uri )
121
+ @index_uri = index_uri
122
+ @index_html = Nokogiri( open( @index_uri, 'r:utf-8', &:read ) )
123
+ end
124
+
125
+ def metainfo
126
+ info = {
127
+ :id => Pathname( @index_uri.path ).basename.to_s,
128
+ :author => ['虚淵玄']
129
+ }
130
+ info[:title] = (@index_html / 'h1 img')[0].attr( 'alt' )
131
+ info
132
+ end
133
+
134
+ def each_chapter
135
+ each_chapter_local( 'article a' ){|c| yield c}
136
+ end
137
+
112
138
  end
113
-
114
139
  end
@@ -0,0 +1,69 @@
1
+ # -*- coding: utf-8; -*-
2
+ #
3
+ # utility methods for converting to AOZORA format
4
+ #
5
+ require 'nokogiri'
6
+
7
+ ##
8
+ # Enhanced String methods for converting to AOZORA format
9
+ #
10
+ class String
11
+ ##
12
+ # Convert half-width alphabet and digit characters to full-width.
13
+ #
14
+ def han2zen
15
+ self.tr( 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
16
+ 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789' )
17
+ end
18
+
19
+ ##
20
+ # replace characters fitting to vertical lyout
21
+ #
22
+ def for_tategaki
23
+ self.tr( '<>‐', '〈〉─' ).han2zen
24
+ end
25
+
26
+ ##
27
+ # section heading format of Aozora
28
+ #
29
+ def subhead
30
+ self.split( /\n/ ).map{|x|
31
+ "\n[#小見出し]#{x}[#小見出し終わり]"
32
+ }.join + "\n\n"
33
+ end
34
+
35
+ ##
36
+ # normalize invalid charcters
37
+ #
38
+ def normalize_char
39
+ self.tr( "\u6451\u5653\u7e6b\uFF0D/", '掴嘘繋─' )
40
+ end
41
+ end
42
+
43
+ module Aozoragen
44
+ ##
45
+ # Utility methods for Aozora format
46
+ #
47
+ module Util
48
+ ##
49
+ # delete HTML tags
50
+ #
51
+ def detag( elem )
52
+ # ruby tags
53
+ (elem / 'ruby rp').each do |rp|
54
+ case rp.text
55
+ when '('
56
+ rp.inner_html = '《'
57
+ when ')'
58
+ rp.inner_html = '》'
59
+ end
60
+ end
61
+
62
+ # delete tags
63
+ elem.to_html.
64
+ gsub( /<br>/, "\n" ).
65
+ gsub( /<.*?>/, '' ).
66
+ strip
67
+ end
68
+ end
69
+ end
@@ -1,3 +1,3 @@
1
1
  module Aozoragen
2
- VERSION = "0.0.6"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -2,67 +2,71 @@
2
2
  #
3
3
  # scraping webmysteries.jp
4
4
  #
5
- require 'nokogiri'
5
+ require 'aozoragen/util'
6
6
  require 'open-uri'
7
7
  require 'pathname'
8
8
  require 'cgi'
9
9
 
10
- class Webmysteries
11
- def initialize( index_uri )
12
- @index_uri = URI( index_uri )
13
- @index_html = Nokogiri( open( @index_uri, 'r:UTF-8', &:read ) )
14
- end
10
+ module Aozoragen
11
+ class Webmysteries
12
+ include Util
15
13
 
16
- def metainfo
17
- info = {:id => Pathname( @index_uri.path ).basename.sub( %r|.*?-(.*)$|, '\1' ).to_s}
18
- info[:title] = (@index_html / '#wiki-body h1')[0].text
19
- (@index_html / '#wiki-body h2 + ul li' ).each do |li|
20
- info[:author] = [li.text]
14
+ def initialize( index_uri )
15
+ @index_uri = URI( index_uri )
16
+ @index_html = Nokogiri( open( @index_uri, 'r:UTF-8', &:read ) )
21
17
  end
22
- info
23
- end
24
-
25
- def each_chapter
26
- (@index_html / '#wiki-body h3 + ul li a' ).each_with_index do |a, i|
27
- uri = URI( a.attr( 'href' ) )
28
- text = "\n"
29
- each_pages( uri ) do |page|
30
- text << page
18
+
19
+ def metainfo
20
+ info = {:id => Pathname( @index_uri.path ).basename.sub( %r|.*?-(.*)$|, '\1' ).to_s}
21
+ info[:title] = (@index_html / '#wiki-body h1')[0].text
22
+ (@index_html / '#wiki-body h2 + ul li' ).each do |li|
23
+ info[:author] = [li.text]
31
24
  end
32
- yield( {id: '%02d' % (i+1), uri: uri, text: text} )
25
+ info
33
26
  end
34
- end
35
-
36
- def each_pages( index )
37
- begin
38
- pages = []
39
- html = Nokogiri( open( index, 'r', &:read ) )
40
- (html / 'ul.pageNavi a').each do |a|
41
- pages << a.attr( 'href' )
27
+
28
+ def each_chapter
29
+ (@index_html / '#wiki-body h3 + ul li a' ).each_with_index do |a, i|
30
+ uri = URI( a.attr( 'href' ) )
31
+ text = "\n"
32
+ each_pages( uri ) do |page|
33
+ text << page
34
+ end
35
+ yield( {id: '%02d' % (i+1), uri: uri, text: text} )
42
36
  end
43
- pages.shift # delete current page
37
+ end
38
+
39
+ def each_pages( index )
44
40
  begin
45
- (html / 'noscript param[name="FlashVars"]')[0].attr( 'value' ).scan( /entry=(\d+)/ ) do |i|
46
- yield get_text( i[0] )
41
+ pages = []
42
+ html = Nokogiri( open( index, 'r', &:read ) )
43
+ (html / 'ul.pageNavi a').each do |a|
44
+ pages << a.attr( 'href' )
47
45
  end
48
- end while html = Nokogiri( open( pages.shift, 'r', &:read ) )
49
- rescue TypeError
50
- # ignore open nil
46
+ pages.shift # delete current page
47
+ begin
48
+ (html / 'noscript param[name="FlashVars"]')[0].attr( 'value' ).scan( /entry=(\d+)/ ) do |i|
49
+ yield get_text( i[0] )
50
+ end
51
+ end while html = Nokogiri( open( pages.shift, 'r', &:read ) )
52
+ rescue TypeError
53
+ # ignore open nil
54
+ end
51
55
  end
52
- end
53
-
54
- def get_text( xml_id )
55
- result = ''
56
- open( "http://www.webmysteries.jp/entry_xml_data/#{xml_id}.xml" ) do |fx|
57
- CGI::unescape( fx.read ).scan( %r|<entryBody>(.*?)</entryBody>|m ) do |entry|
58
- result << entry[0].gsub( %r|<.*?>|m, "" )
56
+
57
+ def get_text( xml_id )
58
+ result = ''
59
+ open( "http://www.webmysteries.jp/entry_xml_data/#{xml_id}.xml" ) do |fx|
60
+ CGI::unescape( fx.read ).scan( %r|<entryBody>(.*?)</entryBody>|m ) do |entry|
61
+ result << entry[0].gsub( %r|<.*?>|m, "" )
62
+ end
59
63
  end
64
+ result.
65
+ gsub( /^.*(つづく).*$/, '[#改ページ]' ).
66
+ gsub( /(?<=.)(([あ-ん]+))/, '《\1》' ).
67
+ gsub( /\n{3,}/m, "\n\n" ).
68
+ for_tategaki.
69
+ normalize_char
60
70
  end
61
- result.
62
- gsub( /^.*(つづく).*$/, '[#改ページ]' ).
63
- gsub( /(?<=.)(([あ-ん]+))/, '《\1》' ).
64
- gsub( /‐/, '─' ).
65
- gsub( /\uFF0D/, '─' ).
66
- gsub( /\n{3,}/m, "\n\n" )
67
71
  end
68
72
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aozoragen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-27 00:00:00.000000000 Z
12
+ date: 2012-02-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &81978730 !ruby/object:Gem::Requirement
16
+ requirement: &72469120 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *81978730
24
+ version_requirements: *72469120
25
25
  description: Scraping some Ebook web site and generating AOZORA format text files.
26
26
  email:
27
27
  - t@tdtds.jp
@@ -41,6 +41,7 @@ files:
41
41
  - lib/aozoragen.rb
42
42
  - lib/aozoragen/renzaburo.rb
43
43
  - lib/aozoragen/sai-zen-sen.rb
44
+ - lib/aozoragen/util.rb
44
45
  - lib/aozoragen/version.rb
45
46
  - lib/aozoragen/webmysteries.rb
46
47
  homepage: https://github.com/tdtds/aozoragen