colread 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/colread +7 -8
- data/lib/colread.rb +11 -4
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eaa58fe8f40e78e6e9e838c227c863bea2fe349f
|
4
|
+
data.tar.gz: 6d25ee9b43562df83027b0beaf301f99f9643b26
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e97b37fac6351f7400ed4e0d267f97ba556d0459c407e55ef68e01f55da38928cc44a5b6bcdb3a01d133c38c2b441999664ea786272cec524df566485cc432ab
|
7
|
+
data.tar.gz: 19ebb47c9bdf28e017d880de748123026ffaf71cdb8d068794e0a2792db7e2e3963c0ac93bfccec5a8e811f62e828f45360edf603ec4555fd70da800e748d92d
|
data/bin/colread
CHANGED
@@ -1,16 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require 'colread'
|
3
|
+
require File.dirname(__FILE__) + '/../lib/colread'
|
4
4
|
require 'rubygems'
|
5
5
|
require 'optparse'
|
6
|
-
#require File.dirname(__FILE__) + '/../lib/readability'
|
7
6
|
|
8
|
-
options = { :
|
7
|
+
options = { :encode => 'GBK' }
|
9
8
|
options_parser = OptionParser.new do |opts|
|
10
|
-
opts.banner = "Usage: #{File.basename($0)} [options]
|
9
|
+
opts.banner = "Usage: #{File.basename($0)} [options] Url FileName"
|
11
10
|
|
12
|
-
opts.on("-
|
13
|
-
options[:
|
11
|
+
opts.on("-e encode", "--encode=encode", "page encode") do |e|
|
12
|
+
options[:encode] = e
|
14
13
|
end
|
15
14
|
|
16
15
|
opts.on_tail("-h", "--help", "Show this message") do
|
@@ -20,9 +19,9 @@ options_parser = OptionParser.new do |opts|
|
|
20
19
|
end
|
21
20
|
options_parser.parse!
|
22
21
|
|
23
|
-
if ARGV.length
|
22
|
+
if ARGV.length < 2
|
24
23
|
STDERR.puts options_parser
|
25
24
|
exit 1
|
26
25
|
end
|
27
26
|
|
28
|
-
ColRead::Text.new(url: ARGV.first, output: ARGV.last).start
|
27
|
+
ColRead::Text.new(url: ARGV.first, output: ARGV.last, encode: options[:encode]).start
|
data/lib/colread.rb
CHANGED
@@ -15,8 +15,8 @@ end
|
|
15
15
|
module ColRead
|
16
16
|
module Core
|
17
17
|
def chapters url
|
18
|
-
doc = Nokogiri::HTML(
|
19
|
-
doc.css('a').group_by{|a| a.indent}.sort_by{|a| a.last.count}.last.last
|
18
|
+
doc = Nokogiri::HTML(_open(url))
|
19
|
+
doc.css('a').select{|a| a.text =~ /\S/ }.group_by{|a| a.indent}.sort_by{|a| a.last.count}.last.last
|
20
20
|
end
|
21
21
|
|
22
22
|
def ahref a
|
@@ -31,18 +31,25 @@ module ColRead
|
|
31
31
|
|
32
32
|
def contents chapters
|
33
33
|
chapters.each do |a|
|
34
|
-
source =
|
34
|
+
source = _open(ahref(a))
|
35
35
|
content=Nokogiri::HTML(Readability::Document.new(source).content).text
|
36
36
|
yield [a.text, content]
|
37
37
|
end
|
38
38
|
end
|
39
|
+
|
40
|
+
def _open url
|
41
|
+
source = open(url).read
|
42
|
+
source.force_encoding(@encode)
|
43
|
+
source.encode!("utf-8", :undef => :replace, :replace => "?", :invalid => :replace)
|
44
|
+
end
|
39
45
|
end
|
40
46
|
|
41
47
|
class Text
|
42
48
|
include ColRead::Core
|
43
|
-
def initialize(options={url: '',output: ''})
|
49
|
+
def initialize(options={url: '',output: '', encode: ''})
|
44
50
|
@url=options[:url]
|
45
51
|
@output=options[:output]
|
52
|
+
@encode=options[:encode]
|
46
53
|
@root=@url.split(/(?<!\/)\/(?!\/)/).first
|
47
54
|
end
|
48
55
|
def start
|