aozoragen 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +6 -0
- data/Gemfile +4 -0
- data/Rakefile +1 -0
- data/aozoragen.gemspec +24 -0
- data/bin/aozora2pdf +20 -0
- data/bin/aozoragen +33 -0
- data/lib/aozoragen/sai-zen-sen.rb +64 -0
- data/lib/aozoragen/version.rb +3 -0
- data/lib/aozoragen.rb +5 -0
- metadata +67 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/aozoragen.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "aozoragen/version"
|
4
|
+
|
5
|
+
# Gem specification for aozoragen.
Gem::Specification.new do |spec|
  # Lists the files tracked by git, optionally narrowed to one pathspec,
  # as an array with one path per element.
  tracked = lambda do |*pathspec|
    command = ['git ls-files', *pathspec].join(' -- ')
    `#{command}`.split("\n")
  end

  # Identity
  spec.name              = "aozoragen"
  spec.version           = Aozoragen::VERSION
  spec.rubyforge_project = "aozoragen"

  # Author and contact
  spec.authors  = ["TADA Tadashi"]
  spec.email    = ["t@tdtds.jp"]
  spec.homepage = ""

  # Descriptions
  spec.summary     = "Generating AOZORA format text of eBook novels via some Web sites."
  spec.description = "Scraping some Ebook web site and generating AOZORA format text files."

  # Packaged files: everything git tracks; executables are the basenames
  # of whatever lives under bin/.
  spec.files         = tracked.call
  spec.test_files    = tracked.call('{test,spec,features}/*')
  spec.executables   = tracked.call('bin/*').map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Dependencies (development dependencies would go here as well, e.g.
  # spec.add_development_dependency "rspec")
  spec.add_runtime_dependency "nokogiri"
end
|
data/bin/aozora2pdf
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8; -*-
|
3
|
+
#
|
4
|
+
# aozora2pdf: converting AOZORA format text to PDF using the Aozora-Kindle service.
|
5
|
+
# see: http://a2k.aill.org/
|
6
|
+
#
|
7
|
+
# sample usage:
|
8
|
+
# % aozora2pdf hoge*.txt > hoge.pdf
|
9
|
+
#
|
10
|
+
# Copyright (C) 2012 by TADA Tadashi <t@tdtds.jp>
|
11
|
+
# Distributed under GPL
|
12
|
+
#
|
13
|
+
require 'uri'
|
14
|
+
require 'net/http'
|
15
|
+
require 'cgi'
|
16
|
+
|
17
|
+
# Endpoint of the Aozora-Kindle conversion service.
uri = URI( 'http://a2k.aill.org/download.cgi' )

# Read all input files named on the command line (or stdin when none are
# given) as one text and tag it as UTF-8.
text = ARGF.read.force_encoding( 'UTF-8' )

# Submit the text as a form post; 's' => 's' is sent as in the service's form.
res = Net::HTTP.post_form( uri, 's' => 's', 'text' => text )

# Stop on a failed request instead of writing an error page to stdout,
# where it would usually be redirected into a file named *.pdf.
unless res.is_a?( Net::HTTPSuccess )
	abort "aozora2pdf: conversion failed: #{res.code} #{res.message}"
end

# The response body is binary PDF data: emit it untouched, with no newline
# translation or output separators applied.
$stdout.binmode
$stdout.write res.body
|
data/bin/aozoragen
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8; -*-
|
3
|
+
#
|
4
|
+
# aozoragen: generating AOZORA format text files by scraping some ebook sites.
|
5
|
+
#
|
6
|
+
# Copyright (C) 2012 by TADA Tadashi <t@tdtds.jp>
|
7
|
+
# Distributed under GPL
|
8
|
+
#
|
9
|
+
require 'uri'
|
10
|
+
|
11
|
+
# Each command line argument is the URI of a book's index page.  For every
# supported book this writes, into the current directory:
#   <book id>.00.txt            cover page (title and authors)
#   <book id>.<chapter id>.txt  one file per chapter
ARGV.each do |u|
	# A malformed argument is reported and skipped so the remaining
	# arguments are still processed.
	begin
		uri = URI( u )
	rescue URI::InvalidURIError => e
		warn "Error: invalid URI: #{u} (#{e.message})"
		next
	end

	# Pick the scraper by host name; a scraper is only loaded when needed.
	book = case uri.host
	when 'sai-zen-sen.jp'
		require 'aozoragen/sai-zen-sen'
		SaiZenSen::new( uri )
	else
		# Diagnostics go to stderr (the original `p` printed a quoted string
		# to stdout).
		warn "Error: unknown URI: #{uri}"
		nil
	end
	next unless book

	meta = book.metainfo

	# File.open rather than Kernel#open: the file name is built from scraped
	# data, and Kernel#open treats a name starting with "|" as a command.
	File.open( "#{meta[:id]}.00.txt", 'w' ) do |w|
		# Array() tolerates a book for which no author was found.
		authors = Array( meta[:author] ).join( ' / ' )
		w.puts "#{meta[:title]}\n#{authors}\n\n\n[#改ページ]"
	end

	book.each_chapter do |chapter|
		File.open( "#{meta[:id]}.#{chapter[:id]}.txt", 'w' ) do |w|
			w.puts chapter[:text]
		end
	end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# -*- coding: utf-8; -*-
|
2
|
+
#
|
3
|
+
# scraping sai-zen-sen.jp
|
4
|
+
#
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'open-uri'
|
7
|
+
require 'pathname'
|
8
|
+
|
9
|
+
#
# Scraper for a book on sai-zen-sen.jp.
#
# Built from the URI of the book's index page.  Provides the book's meta
# information and the text of each chapter linked from the index page,
# converted to AOZORA format.
#
class SaiZenSen
	# Downloads and parses the index page.
	#
	# index_uri:: URI object of the index page.  It has to be a URI object,
	#             not a String: its #path is read by #metainfo and chapter
	#             links are resolved against it with URI#+.
	def initialize( index_uri )
		@index_uri = index_uri
		@index_html = fetch( @index_uri )
	end

	# Returns a Hash describing the book:
	#   :id     last path component of the index URI
	#   :title  text of the first '#page-content-heading h1' element
	#   :author Array of author names, one per '#authors h3' element, each
	#           with everything up to the first space removed; empty when
	#           the page lists no authors
	def metainfo
		info = {:id => Pathname( @index_uri.path ).basename.to_s}
		info[:title] = (@index_html / '#page-content-heading h1')[0].text
		# Always an Array so that callers can #join it without a nil check.
		info[:author] = (@index_html / '#authors h3').map do |author|
			author.text.sub( /.*? /, '' )
		end
		info
	end

	# Downloads every chapter linked from '#back-numbers li a' on the index
	# page and yields one Hash per chapter:
	#   :id   name of the directory part of the chapter URI
	#   :uri  URI of the chapter page
	#   :text chapter text in AOZORA format
	#
	# Returns an Enumerator when called without a block.
	def each_chapter
		return enum_for( :each_chapter ) unless block_given?

		(@index_html / '#back-numbers li a').each do |a|
			href = a.attr( 'href' )
			next unless href # an anchor without a target has nothing to fetch

			uri = @index_uri + href
			chapter = fetch( uri )
			text = ''
			(chapter / 'section.book-page-spread').each do |page|
				# every page spread is closed by a page break marker
				text << page_text( page ) << "[#改ページ]\n"
			end

			yield( {id: Pathname( uri.path ).dirname.basename.to_s, uri: uri, text: text} )
		end
	end

	# Returns the content of +elem+ as plain text: the parentheses inside
	# 'ruby rp' elements are replaced by the AOZORA ruby brackets, then the
	# element is serialized, all tags are removed and the result is stripped.
	#
	# Note that this rewrites the rp elements of +elem+ in place.
	# NOTE(review): the text is taken from serialized HTML, so character
	# entities in it are presumably left undecoded -- confirm.
	def detag( elem )
		(elem / 'ruby rp').each do |rp|
			case rp.text
			when '(', '（' # ASCII or full-width opening parenthesis
				rp.inner_html = '《'
			when ')', '）' # ASCII or full-width closing parenthesis
				rp.inner_html = '》'
			end
		end
		elem.to_html.gsub( /<.*?>/, '' ).strip
	end

private
	# Downloads +uri+ as UTF-8 text and parses it with Nokogiri.
	#
	# URI#open (provided by open-uri) is used because Kernel#open no longer
	# opens URIs as of Ruby 3.0.
	def fetch( uri )
		Nokogiri( uri.open( 'r:utf-8', &:read ) )
	end

	# Converts one 'section.book-page-spread' element to AOZORA text, without
	# the trailing page break marker.
	def page_text( page )
		text = ''
		page./( 'hgroup', 'div.pgroup' ).each do |section|
			case section.name
			when 'hgroup'
				# heading, set off by blank lines
				text << "\n #{detag section}\n\n"
			when 'div'
				if section.attr( 'class' ) =~ /delimiter/
					# delimiter group: rendered as a horizontal rule
					text << ' ' * 5 << '─' * 10 << "\n"
				else
					# paragraph group: one output line per <p> child
					section.children.each do |paragraph|
						next unless paragraph.name == 'p'
						text << " #{detag paragraph}\n"
					end
				end
				text << "\n"
			end
		end
		text
	end
end
|
data/lib/aozoragen.rb
ADDED
metadata
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: aozoragen
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- TADA Tadashi
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-02-23 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: &71959020 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *71959020
|
25
|
+
description: Scraping some Ebook web site and generating AOZORA format text files.
|
26
|
+
email:
|
27
|
+
- t@tdtds.jp
|
28
|
+
executables:
|
29
|
+
- aozora2pdf
|
30
|
+
- aozoragen
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- .gitignore
|
35
|
+
- Gemfile
|
36
|
+
- Rakefile
|
37
|
+
- aozoragen.gemspec
|
38
|
+
- bin/aozora2pdf
|
39
|
+
- bin/aozoragen
|
40
|
+
- lib/aozoragen.rb
|
41
|
+
- lib/aozoragen/sai-zen-sen.rb
|
42
|
+
- lib/aozoragen/version.rb
|
43
|
+
homepage: ''
|
44
|
+
licenses: []
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
require_paths:
|
48
|
+
- lib
|
49
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ! '>='
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
requirements: []
|
62
|
+
rubyforge_project: aozoragen
|
63
|
+
rubygems_version: 1.8.11
|
64
|
+
signing_key:
|
65
|
+
specification_version: 3
|
66
|
+
summary: Generating AOZORA format text of eBook novels via some Web sites.
|
67
|
+
test_files: []
|