aozoragen 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/aozoragen +3 -0
- data/lib/aozoragen/renzaburo.rb +52 -0
- data/lib/aozoragen/version.rb +1 -1
- metadata +4 -3
data/bin/aozoragen
CHANGED
data/lib/aozoragen/renzaburo.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# -*- coding: utf-8; -*-
|
2
|
+
#
|
3
|
+
# scraping renzaburo.jp
|
4
|
+
#
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'open-uri'
|
7
|
+
require 'pathname'
|
8
|
+
|
9
|
+
# Scraper for a serialized work published on renzaburo.jp.
#
# Given the URI of a work's index page, it exposes the work's meta
# information (#metainfo) and the plain text of every chapter linked from
# that index (#each_chapter / #get_content).
class Renzaburo
  # Mode string used for every page read: read-only, bytes tagged as CP932.
  SOURCE_MODE = 'r:CP932'

  # index_uri:: URI (or String) of the work's index page. A trailing
  #             "index.html" is dropped so that chapter links can be
  #             resolved against the directory URI.
  #
  # Fetches and parses the index page immediately.
  def initialize( index_uri )
    @index_uri = URI( index_uri.to_s.sub( /index\.html$/, '' ) )
    @index_html = Nokogiri( read_page( @index_uri ) )
  end

  # Returns a Hash describing the work:
  #   :id     - last path component of the index URI
  #   :title  - text of the <title> element with everything from the first
  #             vertical bar onwards removed (set only if a <title> exists)
  #   :author - one-element Array holding the text of the last
  #             'div.textBlock strong' element (set only if one exists)
  def metainfo
    info = { :id => Pathname( @index_uri.path ).basename.to_s }
    (@index_html / 'title').each do |t|
      # Cut at an ASCII "|" or a full-width bar (U+FF5C). The previous
      # pattern /|.*/ had an empty first alternative, matched the empty
      # string at position 0 and therefore never trimmed anything.
      info[:title] = t.text.sub( /[|\uFF5C].*/, '' )
    end
    (@index_html / 'div.textBlock strong').each do |st|
      info[:author] = [st.text]
    end
    info
  end

  # Yields one Hash per chapter link found on the index page:
  #   :id   - name of the directory that contains the chapter page
  #   :uri  - absolute URI of the chapter page
  #   :text - chapter text as produced by #get_content
  #
  # Without a block an Enumerator is returned; nothing is fetched until it
  # is iterated.
  def each_chapter
    return enum_for( :each_chapter ) unless block_given?

    book_title = metainfo[:title]
    (@index_html / 'ul.btnList li.withDate a').each do |a|
      uri = @index_uri + a.attr( :href )
      text = get_content( uri, book_title )
      yield( { :id => Pathname( uri.path ).dirname.basename.to_s, :uri => uri, :text => text } )
    end
  end

  # Fetches one chapter page and converts it to plain text.
  #
  # uri::        location of the chapter page (URI object, URL String, or a
  #              local path).
  # book_title:: title of the whole work; a leading "『title』 " is removed
  #              from each chapter heading.
  #
  # Returns a String: every <h3> heading and every 'div.textBlock p'
  # paragraph inside 'div#mainContent', terminated by the "[#改ページ]"
  # marker (presumably the Aozora page-break annotation -- confirm).
  def get_content( uri, book_title = '' )
    text = ''.dup
    html = normalize_markup( read_page( uri ).encode( 'UTF-8' ) )
    # Escape the title: it is data, not a pattern, and may contain regex
    # metacharacters such as "(" or "?".
    heading_prefix = /^『#{Regexp.escape( book_title.to_s )}』 /
    (Nokogiri( html ) / 'div#mainContent').each do |content|
      (content / 'h3').each do |t|
        title = t.text.sub( heading_prefix, '' )
        text << "\n #{title}\n\n"
      end
      (content / 'div.textBlock p').each do |para|
        # Skip the "to be continued" notice.
        next if /<次回につづく>/ =~ para.text
        # Paragraphs whose class contains txtAlignC get a fixed 10-space indent.
        text << ' ' * 10 if (para.attr( 'class' ) || '').index( 'txtAlignC' )
        # NOTE(review): para.text should already have tags stripped, so this
        # "<br>" substitution looks like a no-op -- confirm before changing.
        text << para.text.gsub( /<br>/, "\n" ) << "\n\n"
      end
    end
    text << "[#改ページ]\n"
  end

  private

  # Reads +uri+ with SOURCE_MODE and returns the raw page as a String.
  #
  # Kernel#open stopped accepting URIs in Ruby 3.0, so the dispatch open-uri
  # used to perform is spelled out here: objects that know how to open
  # themselves (URI::HTTP, Pathname, ...) do so, URL strings are converted to
  # URI objects first, and anything else is treated as a local file path.
  def read_page( uri )
    return uri.open( SOURCE_MODE, &:read ) if uri.respond_to?( :open )

    if uri.to_s =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
      URI( uri.to_s ).open( SOURCE_MODE, &:read )
    else
      File.open( uri.to_s, SOURCE_MODE, &:read )
    end
  end

  # Rewrites dash and quote characters before the markup is parsed: em dashes
  # (entity or literal U+2014) become U+2500 and &quot; becomes U+201D.
  #
  # Only the &quot; entity is replaced. Replacing raw '"' characters would
  # also rewrite attribute delimiters such as id="mainContent" and the
  # selectors used in #get_content would no longer match.
  def normalize_markup( html )
    html.gsub( /&mdash;|\u2014/, "\u2500" ).gsub( /&quot;/, "\u201D" )
  end
end
|
data/lib/aozoragen/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: aozoragen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-02-23 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &78022280 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *78022280
|
25
25
|
description: Scraping some Ebook web site and generating AOZORA format text files.
|
26
26
|
email:
|
27
27
|
- t@tdtds.jp
|
@@ -38,6 +38,7 @@ files:
|
|
38
38
|
- bin/aozora2pdf
|
39
39
|
- bin/aozoragen
|
40
40
|
- lib/aozoragen.rb
|
41
|
+
- lib/aozoragen/renzaburo.rb
|
41
42
|
- lib/aozoragen/sai-zen-sen.rb
|
42
43
|
- lib/aozoragen/version.rb
|
43
44
|
homepage: ''
|