colread 0.0.1 → 0.0.2
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/colread +7 -8
- data/lib/colread.rb +11 -4
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eaa58fe8f40e78e6e9e838c227c863bea2fe349f
|
4
|
+
data.tar.gz: 6d25ee9b43562df83027b0beaf301f99f9643b26
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e97b37fac6351f7400ed4e0d267f97ba556d0459c407e55ef68e01f55da38928cc44a5b6bcdb3a01d133c38c2b441999664ea786272cec524df566485cc432ab
|
7
|
+
data.tar.gz: 19ebb47c9bdf28e017d880de748123026ffaf71cdb8d068794e0a2792db7e2e3963c0ac93bfccec5a8e811f62e828f45360edf603ec4555fd70da800e748d92d
|
data/bin/colread
CHANGED
@@ -1,16 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require 'colread'
|
3
|
+
require File.dirname(__FILE__) + '/../lib/colread'
|
4
4
|
require 'rubygems'
|
5
5
|
require 'optparse'
|
6
|
-
#require File.dirname(__FILE__) + '/../lib/readability'
|
7
6
|
|
8
|
-
options = { :
|
7
|
+
options = { :encode => 'GBK' }
|
9
8
|
options_parser = OptionParser.new do |opts|
|
10
|
-
opts.banner = "Usage: #{File.basename($0)} [options]
|
9
|
+
opts.banner = "Usage: #{File.basename($0)} [options] Url FileName"
|
11
10
|
|
12
|
-
opts.on("-
|
13
|
-
options[:
|
11
|
+
opts.on("-e encode", "--encode=encode", "page encode") do |e|
|
12
|
+
options[:encode] = e
|
14
13
|
end
|
15
14
|
|
16
15
|
opts.on_tail("-h", "--help", "Show this message") do
|
@@ -20,9 +19,9 @@ options_parser = OptionParser.new do |opts|
|
|
20
19
|
end
|
21
20
|
options_parser.parse!
|
22
21
|
|
23
|
-
if ARGV.length
|
22
|
+
if ARGV.length < 2
|
24
23
|
STDERR.puts options_parser
|
25
24
|
exit 1
|
26
25
|
end
|
27
26
|
|
28
|
-
ColRead::Text.new(url: ARGV.first, output: ARGV.last).start
|
27
|
+
ColRead::Text.new(url: ARGV.first, output: ARGV.last, encode: options[:encode]).start
|
data/lib/colread.rb
CHANGED
@@ -15,8 +15,8 @@ end
|
|
15
15
|
module ColRead
|
16
16
|
module Core
|
17
17
|
def chapters url
|
18
|
-
doc = Nokogiri::HTML(
|
19
|
-
doc.css('a').group_by{|a| a.indent}.sort_by{|a| a.last.count}.last.last
|
18
|
+
doc = Nokogiri::HTML(_open(url))
|
19
|
+
doc.css('a').select{|a| a.text =~ /\S/ }.group_by{|a| a.indent}.sort_by{|a| a.last.count}.last.last
|
20
20
|
end
|
21
21
|
|
22
22
|
def ahref a
|
@@ -31,18 +31,25 @@ module ColRead
|
|
31
31
|
|
32
32
|
def contents chapters
|
33
33
|
chapters.each do |a|
|
34
|
-
source =
|
34
|
+
source = _open(ahref(a))
|
35
35
|
content=Nokogiri::HTML(Readability::Document.new(source).content).text
|
36
36
|
yield [a.text, content]
|
37
37
|
end
|
38
38
|
end
|
39
|
+
|
40
|
+
def _open url
|
41
|
+
source = open(url).read
|
42
|
+
source.force_encoding(@encode)
|
43
|
+
source.encode!("utf-8", :undef => :replace, :replace => "?", :invalid => :replace)
|
44
|
+
end
|
39
45
|
end
|
40
46
|
|
41
47
|
class Text
|
42
48
|
include ColRead::Core
|
43
|
-
def initialize(options={url: '',output: ''})
|
49
|
+
def initialize(options={url: '',output: '', encode: ''})
|
44
50
|
@url=options[:url]
|
45
51
|
@output=options[:output]
|
52
|
+
@encode=options[:encode]
|
46
53
|
@root=@url.split(/(?<!\/)\/(?!\/)/).first
|
47
54
|
end
|
48
55
|
def start
|