aozoragen 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/aozoragen CHANGED
@@ -15,6 +15,9 @@ ARGV.each do |u|
15
15
  when 'sai-zen-sen.jp'
16
16
  require 'aozoragen/sai-zen-sen'
17
17
  book = SaiZenSen::new( uri )
18
+ when 'renzaburo.jp'
19
+ require 'aozoragen/renzaburo'
20
+ book = Renzaburo::new( uri )
18
21
  else
19
22
  p "Error: unknown URI: #{uri}"
20
23
  end
@@ -0,0 +1,52 @@
1
+ # -*- coding: utf-8; -*-
2
+ #
3
+ # scraping renzaburo.jp
4
+ #
5
+ require 'nokogiri'
6
+ require 'open-uri'
7
+ require 'pathname'
8
+
9
# Scraper for a serialized work published on renzaburo.jp.
#
# An instance is built from the URI of a work's index page. It exposes the
# work's metadata (#metainfo) and walks the chapter links found on that index
# page (#each_chapter), turning every chapter page into plain text that ends
# with a page-break marker (#get_content).
class Renzaburo
	# Mode string handed to open-uri; pages are read as CP932.
	PAGE_MODE = 'r:CP932'.freeze

	# index_uri:: String or URI of the index page. A trailing "index.html"
	#             is dropped so that relative chapter links resolve against
	#             the directory.
	#
	# Downloads and parses the index page immediately.
	def initialize( index_uri )
		@index_uri = URI( index_uri.to_s.sub( /index\.html$/, '' ) )
		@index_html = Nokogiri( read_page( @index_uri ) )
	end

	# Returns a Hash describing the work:
	# :id::     last path component of the index URI
	# :title::  text of <title> with the part after the vertical bar removed
	#           (present only when a <title> element exists)
	# :author:: one-element Array holding the text of the last
	#           'div.textBlock strong' element (present only when one exists)
	def metainfo
		info = {:id => Pathname( @index_uri.path ).basename.to_s}
		(@index_html / 'title').each do |t|
			# The bar must sit in a character class: a bare `|` is an empty
			# alternation that matches at position 0 and strips nothing.
			# Both the ASCII and the full-width (U+FF5C) bar are accepted.
			info[:title] = t.text.sub( /\s*[|\uFF5C].*/m, '' )
		end
		(@index_html / 'div.textBlock strong' ).each do |st|
			info[:author] = [st.text]
		end
		info
	end

	# Yields one Hash per chapter link ('ul.btnList li.withDate a') with
	# :id (name of the directory containing the chapter page), :uri and :text.
	# Returns an Enumerator when no block is given.
	def each_chapter
		return enum_for( :each_chapter ) unless block_given?

		book_title = metainfo[:title]
		(@index_html / 'ul.btnList li.withDate a' ).each do |a|
			href = a.attr( :href )
			next unless href # an anchor without href cannot be followed

			uri = @index_uri + href
			text = get_content( uri, book_title )
			yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
		end
	end

	# Downloads one chapter page and returns its text.
	#
	# uri::        URI (or String) of the chapter page
	# book_title:: title of the work; a leading "『title』 " is removed from
	#              every chapter heading
	#
	# Headings (h3) and paragraphs ('div.textBlock p') inside
	# 'div#mainContent' are appended in that order, and the result always
	# ends with the page-break marker.
	def get_content( uri, book_title = '' )
		text = ''.dup
		html = read_page( URI( uri ) ).encode( 'UTF-8' )
		# Escape the title: it is data, not a pattern.
		title_prefix = /^『#{Regexp.escape( book_title.to_s )}』 /
		(Nokogiri( html ) / 'div#mainContent' ).each do |content|
			(content / 'h3').each do |t|
				title = typographic( t.text ).sub( title_prefix, '' )
				text << "\n     #{title}\n\n"
			end
			(content / 'div.textBlock p' ).each do |para|
				next if /<次回につづく>/ =~ para.text
				# Paragraphs carrying the txtAlignC class get a fixed indent.
				text << ' ' * 10 if (para.attr('class') || '').index( 'txtAlignC' )
				# NOTE(review): only a literal "<br>" left in the extracted text
				# is affected here — confirm whether <br> elements were meant.
				text << typographic( para.text ).gsub( /<br>/, "\n" ) << "\n\n"
			end
		end
		text << "[#改ページ]\n"
	end

	private

	# Reads the page at +uri+ through open-uri. URI#open is used because the
	# Kernel#open override for URIs no longer exists in Ruby 3.0 and later.
	def read_page( uri )
		uri.open( PAGE_MODE, &:read )
	end

	# Replaces the em dash with U+2500 and double quotes (ASCII or full-width
	# U+FF02) with U+201D. This runs on extracted text, never on raw markup,
	# so the quotes around HTML attribute values stay intact for the parser.
	def typographic( str )
		str.gsub( "\u2014", "\u2500" ).gsub( /["\uFF02]/, "\u201D" )
	end
end
@@ -1,3 +1,3 @@
1
1
  module Aozoragen
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aozoragen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-02-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &71959020 !ruby/object:Gem::Requirement
16
+ requirement: &78022280 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *71959020
24
+ version_requirements: *78022280
25
25
  description: Scraping some Ebook web site and generating AOZORA format text files.
26
26
  email:
27
27
  - t@tdtds.jp
@@ -38,6 +38,7 @@ files:
38
38
  - bin/aozora2pdf
39
39
  - bin/aozoragen
40
40
  - lib/aozoragen.rb
41
+ - lib/aozoragen/renzaburo.rb
41
42
  - lib/aozoragen/sai-zen-sen.rb
42
43
  - lib/aozoragen/version.rb
43
44
  homepage: ''