aozoragen 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +6 -0
- data/Gemfile +4 -0
- data/Rakefile +1 -0
- data/aozoragen.gemspec +24 -0
- data/bin/aozora2pdf +20 -0
- data/bin/aozoragen +33 -0
- data/lib/aozoragen/sai-zen-sen.rb +64 -0
- data/lib/aozoragen/version.rb +3 -0
- data/lib/aozoragen.rb +5 -0
- metadata +67 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/aozoragen.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "aozoragen/version"
|
4
|
+
|
5
|
+
# Gem specification for aozoragen. The version comes from
# Aozoragen::VERSION and the packaged file lists are taken from git.
Gem::Specification.new do |spec|
  # Identity and authorship.
  spec.name        = "aozoragen"
  spec.version     = Aozoragen::VERSION
  spec.authors     = ["TADA Tadashi"]
  spec.email       = ["t@tdtds.jp"]
  spec.homepage    = ""
  spec.summary     = "Generating AOZORA format text of eBook novels via some Web sites."
  spec.description = "Scraping some Ebook web site and generating AOZORA format text files."

  spec.rubyforge_project = "aozoragen"

  # Package contents are whatever git tracks; executables are the basenames
  # of the tracked files under bin/.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map(&File.method(:basename))
  spec.require_paths = ["lib"]

  # specify any dependencies here; for example:
  # spec.add_development_dependency "rspec"
  spec.add_runtime_dependency "nokogiri"
end
|
data/bin/aozora2pdf
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8; -*-
|
3
|
+
#
|
4
|
+
# aozora2pdf: converting AOZORA format text to PDF using Aozora-Kindle service.
|
5
|
+
# see: http://a2k.aill.org/
|
6
|
+
#
|
7
|
+
# sample usage:
|
8
|
+
# % aozora2pdf hoge*.txt > hoge.pdf
|
9
|
+
#
|
10
|
+
# Copyright (C) 2012 by TADA Tadashi <t@tdtds.jp>
|
11
|
+
# Distributed under GPL
|
12
|
+
#
|
13
|
+
require 'uri'
|
14
|
+
require 'net/http'
|
15
|
+
require 'cgi'
|
16
|
+
|
17
|
+
# Read AOZORA-format text from the files named on the command line (or from
# stdin), post it to the conversion service, and write the response body
# (the PDF) to stdout.
uri = URI('http://a2k.aill.org/download.cgi')
text = ARGF.read.force_encoding('UTF-8')
res = Net::HTTP.post_form(uri, 's' => 's', 'text' => text)

# A failed request must not be written to stdout as if it were a PDF;
# report it on stderr and exit non-zero instead.
unless res.is_a?(Net::HTTPSuccess)
  warn "aozora2pdf: conversion failed: #{res.code} #{res.message}"
  exit 1
end

# The body is binary data, so keep stdout from applying any text-mode
# newline or encoding conversion.
$stdout.binmode
print res.body
|
data/bin/aozoragen
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8; -*-
|
3
|
+
#
|
4
|
+
# aozoragen: generating AOZORA format text files from scraping some ebook sites.
|
5
|
+
#
|
6
|
+
# Copyright (C) 2012 by TADA Tadashi <t@tdtds.jp>
|
7
|
+
# Distributed under GPL
|
8
|
+
#
|
9
|
+
require 'uri'
|
10
|
+
|
11
|
+
# For every index-page URI given on the command line, scrape the book and
# write it to the current directory as AOZORA-format text files:
#   <id>.00.txt            title, authors and a page break
#   <id>.<chapter id>.txt  one file per chapter
ARGV.each do |u|
  uri = URI(u)
  book =
    case uri.host
    when 'sai-zen-sen.jp'
      # Required only when a URI for this site is actually requested.
      require 'aozoragen/sai-zen-sen'
      SaiZenSen.new(uri)
    end

  unless book
    # Diagnostics belong on stderr as plain text (Kernel#p wrote the
    # inspected, quoted string to stdout).
    warn "Error: unknown URI: #{uri}"
    next
  end

  meta = book.metainfo
  # Array() tolerates a missing :author entry (no author headings scraped).
  authors = Array(meta[:author]).join(' / ')

  # File.open never interprets the name as a command, unlike Kernel#open
  # with a name that starts with "|".
  File.open("#{meta[:id]}.00.txt", 'w') do |w|
    w.puts "#{meta[:title]}\n#{authors}\n\n\n[#改ページ]"
  end

  book.each_chapter do |chapter|
    File.open("#{meta[:id]}.#{chapter[:id]}.txt", 'w') do |w|
      w.puts chapter[:text]
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# -*- coding: utf-8; -*-
|
2
|
+
#
|
3
|
+
# scraping sai-zen-sen.jp
|
4
|
+
#
|
5
|
+
require 'cgi'
require 'nokogiri'
require 'open-uri'
require 'pathname'
|
8
|
+
|
9
|
+
# Scraper for one serialized work on sai-zen-sen.jp.
#
# Built from the URI of the work's index page, it exposes the work's
# metadata (#metainfo) and each linked chapter as AOZORA-format text
# (#each_chapter).
class SaiZenSen
  # Fetches and parses the index page immediately.
  #
  # @param index_uri [URI] URI of the work's index page (must respond to
  #   #open, #path and #+).
  def initialize(index_uri)
    @index_uri = index_uri
    @index_html = fetch_html(@index_uri)
  end

  # Collects the work's metadata from the index page.
  #
  # @return [Hash] with keys
  #   :id     - last path segment of the index URI,
  #   :title  - text of the first '#page-content-heading h1' element,
  #   :author - Array of author names from '#authors h3'; always present,
  #             empty when the page lists no author headings.
  # @raise [NoMethodError] when the page has no title heading.
  def metainfo
    authors = (@index_html / '#authors h3').map do |heading|
      # Drop everything up to and including the first space, keeping only
      # what follows it.
      heading.text.sub(/.*? /, '')
    end

    {
      :id => Pathname(@index_uri.path).basename.to_s,
      :title => (@index_html / '#page-content-heading h1')[0].text,
      :author => authors
    }
  end

  # Fetches every chapter linked from '#back-numbers li a' on the index page
  # and yields it converted to AOZORA-format text.
  #
  # @yieldparam chapter [Hash] :id (name of the directory containing the
  #   chapter page), :uri (absolute chapter URI) and :text (converted text).
  # @return [Enumerator] when called without a block.
  def each_chapter
    return enum_for(:each_chapter) unless block_given?

    (@index_html / '#back-numbers li a').each do |a|
      # Links may be relative; resolve them against the index URI.
      uri = @index_uri + a.attr('href')
      text = chapter_text(fetch_html(uri))

      yield({id: Pathname(uri.path).dirname.basename.to_s, uri: uri, text: text})
    end
  end

  # Renders an element as plain text. Ruby-annotation parentheses
  # ('ruby rp' elements containing "(" or ")") are rewritten in place to the
  # 《 》 markers, then all markup is stripped and HTML entities are decoded.
  #
  # Note: this mutates the rp elements of +elem+.
  #
  # @param elem [#/, #to_html] a Nokogiri node (or compatible object).
  # @return [String] the stripped plain text.
  def detag(elem)
    (elem / 'ruby rp').each do |rp|
      case rp.text
      when '(' then rp.inner_html = '《'
      when ')' then rp.inner_html = '》'
      end
    end

    # /m lets a tag that spans several lines be removed as well; entities
    # are decoded only after the tags are gone so that an escaped "<" in the
    # text is never mistaken for markup.
    CGI.unescapeHTML(elem.to_html.gsub(/<.*?>/m, '')).strip
  end

  private

  # Downloads +uri+ as UTF-8 text and parses it with Nokogiri.
  # URI#open (from open-uri) is used because Kernel#open stopped accepting
  # URIs in Ruby 3.0.
  def fetch_html(uri)
    Nokogiri(uri.open('r:utf-8', &:read))
  end

  # Converts a parsed chapter page to AOZORA-format text: every
  # 'section.book-page-spread' becomes one page terminated by a page break.
  # NOTE(review): the indentation literals below are reproduced exactly as
  # found; confirm whether full-width spaces were intended.
  def chapter_text(chapter)
    text = ''
    (chapter / 'section.book-page-spread').each do |page|
      page.search('hgroup', 'div.pgroup').each do |section|
        case section.name
        when 'hgroup'
          # Heading: set off by blank lines.
          text << "\n #{detag section}\n\n"
        when 'div'
          if section.attr('class').to_s.include?('delimiter')
            # Delimiter group: rendered as a horizontal rule.
            text << ' ' * 5 << '─' * 10 << "\n"
          else
            section.children.each do |paragraph|
              next unless paragraph.name == 'p'
              text << " #{detag paragraph}\n"
            end
          end
          text << "\n"
        end
      end
      text << "[#改ページ]\n"
    end
    text
  end
end
|
data/lib/aozoragen.rb
ADDED
metadata
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: aozoragen
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- TADA Tadashi
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-02-23 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: &71959020 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *71959020
|
25
|
+
description: Scraping some Ebook web site and generating AOZORA format text files.
|
26
|
+
email:
|
27
|
+
- t@tdtds.jp
|
28
|
+
executables:
|
29
|
+
- aozora2pdf
|
30
|
+
- aozoragen
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- .gitignore
|
35
|
+
- Gemfile
|
36
|
+
- Rakefile
|
37
|
+
- aozoragen.gemspec
|
38
|
+
- bin/aozora2pdf
|
39
|
+
- bin/aozoragen
|
40
|
+
- lib/aozoragen.rb
|
41
|
+
- lib/aozoragen/sai-zen-sen.rb
|
42
|
+
- lib/aozoragen/version.rb
|
43
|
+
homepage: ''
|
44
|
+
licenses: []
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
require_paths:
|
48
|
+
- lib
|
49
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ! '>='
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
requirements: []
|
62
|
+
rubyforge_project: aozoragen
|
63
|
+
rubygems_version: 1.8.11
|
64
|
+
signing_key:
|
65
|
+
specification_version: 3
|
66
|
+
summary: Generating AOZORA format text of eBook novels via some Web sites.
|
67
|
+
test_files: []
|