aozoragen 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/bin/aozoragen CHANGED
@@ -15,6 +15,9 @@ ARGV.each do |u|
15
15
  when 'sai-zen-sen.jp'
16
16
  require 'aozoragen/sai-zen-sen'
17
17
  book = SaiZenSen::new( uri )
18
+ when 'renzaburo.jp'
19
+ require 'aozoragen/renzaburo'
20
+ book = Renzaburo::new( uri )
18
21
  else
19
22
  p "Error: unknown URI: #{uri}"
20
23
  end
@@ -0,0 +1,52 @@
1
+ # -*- coding: utf-8; -*-
2
+ #
3
+ # scraping renzaburo.jp
4
+ #
5
+ require 'nokogiri'
6
+ require 'open-uri'
7
+ require 'pathname'
8
+
9
# Scraper for a serialized work published on renzaburo.jp.
#
# Given the URI of a work's index page, exposes the work's metadata
# (#metainfo) and the text of every chapter linked from that index
# (#each_chapter / #get_content). Pages are read as CP932.
class Renzaburo
  # Closing marker paragraph that is left out of the generated text.
  TO_BE_CONTINUED = /<次回につづく>/

  # Fetches and parses the index page.
  #
  # @param index_uri [String, URI] index page URI; a trailing "index.html"
  #   is removed so that the URI denotes the work's directory.
  def initialize( index_uri )
    @index_uri = URI( index_uri.to_s.sub( /index\.html$/, '' ) )
    # URI#open comes from open-uri. Kernel#open stopped accepting URIs in
    # Ruby 3.0, while URI#open behaves the same on old and new Rubies.
    @index_html = Nokogiri( @index_uri.open( 'r:CP932', &:read ) )
  end

  # Collects the work's metadata from the index page.
  #
  # @return [Hash] :id (last path component of the index URI), plus :title
  #   and :author (one-element Array) when the page provides them.
  def metainfo
    info = {:id => Pathname( @index_uri.path ).basename.to_s}
    (@index_html / 'title').each do |t|
      # Drop everything from the first vertical bar onwards. The bar has to
      # sit inside a character class: a bare `|` is regex alternation and
      # matches the empty string, which made the original sub a no-op.
      # NOTE(review): the full-width bar is accepted as well on the
      # assumption that the site may use it -- confirm against a live page.
      info[:title] = t.text.sub( /[|｜].*/, '' )
    end
    (@index_html / 'div.textBlock strong' ).each do |st|
      info[:author] = [st.text]
    end
    info
  end

  # Yields one Hash per chapter linked from the index page.
  #
  # @yieldparam chapter [Hash] :id (name of the chapter's directory),
  #   :uri (absolute chapter URI) and :text (result of #get_content).
  # @return [Enumerator] when called without a block.
  def each_chapter
    return enum_for( :each_chapter ) unless block_given?

    book_title = metainfo[:title]
    (@index_html / 'ul.btnList li.withDate a' ).each do |a|
      uri = @index_uri + a.attr( :href )
      text = get_content( uri, book_title )
      yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
    end
  end

  # Downloads one chapter page and converts it to plain text.
  #
  # @param uri [URI, String] chapter page location.
  # @param book_title [String, nil] work title; a leading "『title』 " is
  #   removed from chapter headings.
  # @return [String] heading(s) and paragraphs followed by a page-break line.
  def get_content( uri, book_title = '' )
    uri = URI( uri ) if uri.respond_to?( :to_str )
    html = uri.open( 'r:CP932', &:read ).encode( 'UTF-8' )
    # Rewrite the *entities* only. Replacing raw `"` characters here would
    # also hit attribute delimiters (id="mainContent") and break the
    # selectors below.
    html = html.gsub( /&mdash;|—/, "\u2500" ).gsub( '&quot;', "\u201D" )

    # The title is literal text, so escape regex metacharacters in it.
    heading_prefix = /^『#{Regexp.escape( book_title.to_s )}』 /

    text = ''
    (Nokogiri( html ) / 'div#mainContent' ).each do |content|
      (content / 'h3').each do |t|
        title = t.text.sub( heading_prefix, '' )
        text << "\n     #{title}\n\n"
      end
      (content / 'div.textBlock p' ).each do |para|
        next if TO_BE_CONTINUED =~ para.text
        text << ' ' * 10 if (para.attr( 'class' ) || '').include?( 'txtAlignC' )
        text << paragraph_text( para ) << "\n\n"
      end
    end
    # NOTE(review): Aozora annotations are normally written with full-width
    # brackets; this literal is kept exactly as found -- confirm.
    text << "[#改ページ]\n"
  end

  private

  # Returns the paragraph's text with each <br> turned into a newline.
  # Node#text discards elements, so the breaks must be replaced in the tree
  # before the text is taken; a gsub on the extracted text can never see them.
  def paragraph_text( para )
    (para / 'br').each do |br|
      br.replace( Nokogiri::XML::Text.new( "\n", br.document ) )
    end
    para.text
  end
end
@@ -1,3 +1,3 @@
1
1
  module Aozoragen
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aozoragen
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-02-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &71959020 !ruby/object:Gem::Requirement
16
+ requirement: &78022280 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *71959020
24
+ version_requirements: *78022280
25
25
  description: Scraping some Ebook web site and generating AOZORA format text files.
26
26
  email:
27
27
  - t@tdtds.jp
@@ -38,6 +38,7 @@ files:
38
38
  - bin/aozora2pdf
39
39
  - bin/aozoragen
40
40
  - lib/aozoragen.rb
41
+ - lib/aozoragen/renzaburo.rb
41
42
  - lib/aozoragen/sai-zen-sen.rb
42
43
  - lib/aozoragen/version.rb
43
44
  homepage: ''