sinew 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +6 -0
- data/Gemfile +2 -0
- data/LICENSE +20 -0
- data/README.md +34 -0
- data/Rakefile +30 -0
- data/bin/sinew +16 -0
- data/lib/sinew.rb +6 -0
- data/lib/sinew/curler.rb +173 -0
- data/lib/sinew/main.rb +172 -0
- data/lib/sinew/nokogiri_ext.rb +27 -0
- data/lib/sinew/text_util.rb +103 -0
- data/lib/sinew/util.rb +236 -0
- data/lib/sinew/version.rb +4 -0
- data/sample.sinew +8 -0
- data/sinew.gemspec +29 -0
- metadata +179 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Adam Doppelt
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
## Welcome to Sinew
|
2
|
+
|
3
|
+
Sinew collects structured data from web sites (screen scraping). It provides a Ruby DSL built for crawling, a robust caching system, and integration with [Nokogiri](http://nokogiri.org). Though small, this project is the culmination of years of effort based on crawling systems built at several different companies.
|
4
|
+
|
5
|
+
Sinew requires Ruby 1.9, [HTML Tidy](http://tidy.sourceforge.net) and [Curl](http://curl.haxx.se).
|
6
|
+
|
7
|
+
## Example
|
8
|
+
|
9
|
+
Here's an example for collecting Amazon's bestseller list:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
# get the url
|
13
|
+
get "http://www.amazon.com/gp/bestsellers/books/ref=sv_b_3"
|
14
|
+
|
15
|
+
# use nokogiri to find books
|
16
|
+
noko.css(".zg_itemRow").each do |item|
|
17
|
+
# pull out the stuff we care about using nokogiri
|
18
|
+
row = { }
|
19
|
+
row[:url] = item.css(".zg_title a").first[:href]
|
20
|
+
row[:title] = item.css(".zg_title")
|
21
|
+
row[:img] = item.css(".zg_itemImage_normal img").first[:src]
|
22
|
+
|
23
|
+
# append a row to the csv
|
24
|
+
csv_emit(row)
|
25
|
+
end
|
26
|
+
```
|
27
|
+
|
28
|
+
If you paste this into a file called `bestsellers.sinew` and run `sinew bestsellers.sinew`, it will create a `bestsellers.csv` file containing the url, title and img for each bestseller.
|
29
|
+
|
30
|
+
## Full Documentation
|
31
|
+
|
32
|
+
Full docs are in the wiki:
|
33
|
+
|
34
|
+
https://github.com/gurgeous/sinew/wiki
|
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require "bundler"
|
2
|
+
require "bundler/setup"
|
3
|
+
require "rake"
|
4
|
+
|
5
|
+
$LOAD_PATH << File.expand_path("../lib", __FILE__)
|
6
|
+
require "sinew/version"
|
7
|
+
|
8
|
+
#
|
9
|
+
# gem
|
10
|
+
#
|
11
|
+
|
12
|
+
# `rake gem` is an alias for `rake build`.
task :gem => :build

# Build sinew-<version>.gem from the gemspec.
# `sh` (unlike `system`) raises when the command exits non-zero, so a failed
# build stops dependent tasks instead of being silently ignored.
task :build do
  sh "gem build --quiet sinew.gemspec"
end

# Build, then install the freshly built gem system-wide.
task :install => :build do
  sh "sudo gem install --quiet sinew-#{Sinew::VERSION}.gem"
end

# Build, tag the current commit with the version, push tags, and publish the
# gem. Each step only runs if the previous one succeeded.
task :release => :build do
  sh "git tag -a #{Sinew::VERSION} -m 'Tagging #{Sinew::VERSION}'"
  sh "git push --tags"
  sh "gem push sinew-#{Sinew::VERSION}.gem"
end

task :default => :gem
|
28
|
+
|
29
|
+
# to test:
|
30
|
+
# block ; rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
|
data/bin/sinew
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "sinew"
|
4
|
+
require "trollop"
|
5
|
+
|
6
|
+
# ARGV
|
7
|
+
# Parse command line flags; whatever is left in ARGV is the list of recipes.
options = Trollop.options do
  banner "Usage: sinew [options] <gub.sinew>"
  opt :verbose, "Dump every row"
end
# Array#empty? is core Ruby, so this check does not depend on ActiveSupport's
# Object#blank? having been loaded as a side effect of requiring sinew.
Trollop.die "need a .sinew file to run against" if ARGV.empty?

# now run! Each recipe is executed by constructing a Sinew::Main for it.
ARGV.each do |recipe|
  Sinew::Main.new(options.merge(file: recipe))
end
|
data/lib/sinew.rb
ADDED
data/lib/sinew/curler.rb
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
require "uri"
|
2
|
+
|
3
|
+
module Sinew
  # Fetches URLs by shelling out to curl and caches every response on disk.
  #
  # Response bodies are stored under +root+ at a path derived from the URL
  # (plus the POST body, if any). Response headers go into a sibling "head"
  # directory and are only kept when the request was redirected. After a
  # call to get/post, +url+ and +uri+ hold the final (post-redirect) location.
  class Curler
    # Raised when curl fails, or when a previously cached failure is replayed.
    class Error < StandardError ; end

    DEFAULT_OPTIONS = {
      :cache_errors => true,
      :max_time => 30,
      :retry => 3,
      :verbose => true,
    }

    attr_reader :url, :uri, :root

    # options - overrides for DEFAULT_OPTIONS, plus:
    #   :user_agent - value passed to curl's --user-agent
    #   :dir        - cache root; defaults to ~/.sinew when $HOME is usable,
    #                 otherwise /tmp/sinew
    def initialize(options = {})
      @options = DEFAULT_OPTIONS.merge(options)
      @curl_args = ["--silent", "--fail", "--user-agent", @options[:user_agent], "--max-time", @options[:max_time], "--retry", @options[:retry], "--location", "--max-redirs", "3"]
      @last_request = Time.at(0)
      @root = @options[:dir] || default_root
    end

    # Fetch url with GET. Returns the path of the cached body.
    def get(url)
      curl(url, nil)
    end

    # Fetch url with POST, sending body. Returns the path of the cached body.
    def post(url, body)
      curl(url, body)
    end

    # Fetch url (String or URI), POSTing body when it is non-nil, unless a
    # cached copy already exists. Returns the path of the cached body and
    # updates +url+/+uri+. Raises Curler::Error on (possibly cached) failure.
    def curl(url, body)
      #
      # prepare url/uri and calculate paths
      #

      @uri = url.is_a?(URI) ? url : Curler.url_to_uri(url.to_s)
      @url = @uri.to_s

      path = fullpath(@uri)
      path = "#{path},#{Util.pathify(body)}" if body

      # shorten long paths
      if path.length > 250
        path = "#{File.dirname(path)}/#{Util.md5(File.basename(path))}"
      end

      head = "#{File.dirname(path)}/head/#{File.basename(path)}"

      fetch(path, head, body) if !File.exist?(path)

      #
      # handle redirects (recalculate @uri/@url)
      #

      follow_redirects(head) if File.exist?(head)

      path
    end

    # Print s to $stderr when the :verbose option is on.
    def verbose(s)
      $stderr.puts s if @options[:verbose]
    end

    #
    # helpers
    #

    # Cache path (under root) for a URI.
    def fullpath(uri)
      "#{@root}/#{Curler.uri_to_path(uri)}"
    end

    # Remove the cached body for url, if there is one.
    def uncache!(url)
      Util.rm_if_necessary("#{@root}/#{Curler.url_to_path(url)}")
    end

    # Parse a url String into a URI, escaping spaces and single quotes first.
    def self.url_to_uri(url)
      url = url.gsub(" ", "%20")
      url = url.gsub("'", "%27")
      URI.parse(url)
    end

    # Relative cache path for a url String.
    def self.url_to_path(url)
      uri_to_path(url_to_uri(url))
    end

    # Relative cache path for a URI: "<host>/<path and query>", both pathified.
    def self.uri_to_path(uri)
      s = uri.path
      s = "#{s}?#{uri.query}" if uri.query
      "#{Util.pathify(uri.host)}/#{Util.pathify(s)}"
    end

    # Sleep as needed so that requests are at least one second apart.
    def rate_limit
      # named "delay" rather than "sleep" so Kernel#sleep is not shadowed
      delay = (@last_request + 1) - Time.now
      sleep(delay) if delay > 0
      @last_request = Time.now
    end

    private

    # Pick the default cache root. Guards against $HOME being unset, which
    # would otherwise make File.exist? raise.
    def default_root
      home = ENV["HOME"]
      if home && File.exist?(home) && File.stat(home).writable?
        "#{home}/.sinew"
      else
        "/tmp/sinew"
      end
    end

    # Run curl for @url and move the body/headers into the cache at
    # path/head. On failure, optionally caches the error and raises Error.
    def fetch(path, head, body)
      verbose(body ? "curl #{@url} (POST)" : "curl #{@url}")
      tmp = "/tmp/curler_#{Util.random_text(6)}"
      tmph = "#{tmp}.head"
      begin
        rate_limit
        Util.mkdir_if_necessary(File.dirname(path))
        Util.mkdir_if_necessary(File.dirname(head))
        begin
          command = @curl_args.dup
          if body
            command += ["--data-binary", body]
            command += ["--header", "Content-Type: application/x-www-form-urlencoded"]
          end
          command += ["--output", tmp]
          command += ["--dump-header", tmph]
          command << @url

          Util.run("curl", command)

          # empty response?
          if !File.exist?(tmp)
            Util.touch(tmp)
            Util.touch(tmph)
          end
        rescue Util::RunError => e
          # Util.run's message ends with the exit status; surface it
          message = "curl error"
          if e.message =~ /(\d+)$/
            message = "#{message} (#{$1})"
          end

          # cache the error?
          if @options[:cache_errors]
            File.open(path, "w") { |f| f.puts "" }
            File.open(head, "w") { |f| f.puts "CURLER_ERROR\t#{message}" }
          end

          raise Error, message
        end
        Util.mv(tmp, path)
        Util.mv(tmph, head)
      ensure
        Util.rm_if_necessary(tmp)
        Util.rm_if_necessary(tmph)
      end
    end

    # Read the cached headers: replay a cached error, or walk every response
    # in the redirect chain so that @uri/@url end up at the final location.
    def follow_redirects(head)
      head_contents = File.read(head)

      # handle cached errors
      if head_contents =~ /^CURLER_ERROR\t(.*)/
        raise Error, $1
      end

      original = @uri
      # The head file holds one header block per response. The pattern is
      # anchored per line (not with \A) so each block in the chain is
      # visited, not just the first one.
      head_contents.scan(/^(HTTP\/\d(?:\.\d)? (\d+).*?\r\n\r\n)/m) do |headers, code|
        next if !code.start_with?("3")
        # header names are case-insensitive
        if redir = headers[/^Location: ([^\r\n]+)/i, 1]
          @uri += redir
          @url = @uri.to_s
        end
      end

      # kill unnecessary head files
      Util.rm(head) if original == @uri
    end
  end
end
|
data/lib/sinew/main.rb
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
require "nokogiri" # must be loaded before awesome_print
|
2
|
+
require "awesome_print"
|
3
|
+
require "cgi"
|
4
|
+
require "csv"
|
5
|
+
require "htmlentities"
|
6
|
+
require "stringex"
|
7
|
+
|
8
|
+
module Sinew
  # Runs a single .sinew recipe. The recipe file is instance_eval'd against
  # this object, so the public methods below (get, post, html, clean, noko,
  # csv_header, csv_emit, ...) form the DSL available to recipes.
  class Main
    CODER = HTMLEntities.new

    attr_accessor :url, :uri, :raw

    # options - Hash; :file is the recipe path (required), :verbose echoes
    # each emitted row to $stderr.
    #
    # Runs the whole recipe before returning. Exits via Util.fatal when the
    # recipe file does not exist.
    def initialize(options)
      @options = options.dup
      @csv = @path = nil

      @curler = Curler.new(user_agent: "sinew/#{VERSION}")

      file = @options[:file]
      if !File.exist?(file)
        Util.fatal("#{file} not found")
      end

      tm = Time.now
      # NOTE: the recipe is arbitrary Ruby and runs with full privileges;
      # only run recipes you trust.
      instance_eval(File.read(file, mode: "rb"), file)
      if @path
        Util.banner("Finished #{@path} in #{(Time.now - tm).to_i}s.")
      else
        Util.banner("Finished in #{(Time.now - tm).to_i}s.")
      end
    ensure
      # release the csv file handle once the recipe is done (or has failed)
      @csv.close if @csv && !@csv.closed?
    end

    # Fetch url with GET; params (a Hash) are appended to the query string.
    def get(url, params = nil)
      http(url, params, :get)
    end

    # Fetch url with POST; params (a Hash) become the form-encoded body.
    def post(url, params = nil)
      http(url, params, :post)
    end

    # Shared implementation of get/post. Sets @raw to the response body (or
    # "" when the fetch failed), updates @url/@uri to what the curler
    # reports, and resets the lazily computed html/clean/noko. Returns nil.
    # Raises RuntimeError if url does not start with "http".
    def http(url, params, method)
      url = url.to_s
      raise "invalid url #{url.inspect}" if url !~ /^http/i

      # decode entities
      url = CODER.decode(url)

      # handle params: escaped and sorted so the same params always map to
      # the same request string
      body = nil
      if params
        q = params.map { |key, value| [CGI.escape(key.to_s), CGI.escape(value.to_s)] }.sort
        q = q.map { |key, value| "#{key}=#{value}" }.join("&")
        if method == :get
          separator = url.include?("?") ? "&" : "?"
          url = "#{url}#{separator}#{q}"
        else
          body = q
        end
      end

      begin
        if method == :get
          path = @curler.get(url)
        else
          path = @curler.post(url, body)
        end
        @raw = File.read(path, mode: "rb")
      rescue Curler::Error => e
        # a failed fetch is reported but deliberately not fatal
        $stderr.puts "xxx #{e.message}"
        @raw = ""
      end

      # setup local variables
      @url, @uri = @curler.url, @curler.uri
      @html = nil
      @clean = nil
      @noko = nil

      nil
    end

    #
    # lazy accessors for cleaned up version
    #

    # @raw after a pass through tidy. If tidy appears to have dropped more
    # than 20% of the "<" characters, retries with XML processing
    # instructions stripped and warns if that does not help.
    def html
      @html ||= begin
        s = TextUtil.html_tidy(@raw)
        nelements = @raw.count("<")
        if nelements > 1
          # is there a problem with tidy?
          percent = 100 * s.count("<") / nelements
          if percent < 80
            # bad xml processing instruction? Try fixing it.
            maybe = TextUtil.html_tidy(@raw.gsub(/<\?[^>]*?>/, ""))
            new_percent = 100 * maybe.count("<") / nelements
            if new_percent > 80
              # yes!
              s = maybe
            else
              Util.warning "Hm - it looks like tidy ate some of your file (#{percent}%)"
            end
          end
        end
        s
      end
    end

    # html with most attributes stripped (see TextUtil.html_clean_from_tidy).
    def clean
      @clean ||= TextUtil.html_clean_from_tidy(self.html)
    end

    # html parsed into a Nokogiri document.
    def noko
      @noko ||= Nokogiri::HTML(html)
    end

    #
    # csv
    #

    # Open the output csv and write the header row.
    #
    # args - column keys, optionally preceded by a String file name. A
    # relative name is resolved against the recipe's directory; with no
    # name, the recipe path itself is used. Either way the extension is
    # replaced by ".csv".
    def csv_header(*args)
      args = args.flatten
      if args.first.is_a?(String)
        file = args.shift
        if file !~ /^\//
          file = "#{File.dirname(@options[:file])}/#{file}"
        end
      else
        file = @options[:file]
      end

      # Swap only the trailing extension. A global substitution would also
      # rewrite the same text appearing earlier in the path.
      ext = File.extname(file)
      if !ext.empty? && file.end_with?(ext)
        file = file[0, file.length - ext.length]
      end
      file = "#{file}.csv"

      # don't leak a previously opened csv
      @csv.close if @csv && !@csv.closed?

      @path = file
      @csv = CSV.open(file, "wb")
      @csv_keys = args
      @csv << @csv_keys
      Util.banner("Writing to #{@path}...")
    end

    # Turn a cell value into plain, whitespace-squished ASCII text.
    # Nokogiri nodes contribute their inner html, Arrays are joined with "|",
    # anything else goes through to_s; tags and entities are then removed.
    # The key argument is currently unused.
    def normalize(key, s)
      case s
      when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
        s = s.inner_html
      when Array
        s = s.map { |j| j.to_s }.join("|")
      else
        s = s.to_s
      end
      s = TextUtil.untag(s)
      s = s.convert_accented_entities
      s = TextUtil.unent(s)
      s = s.to_ascii.squish
      s
    end

    # Append a row (Hash of key => value) to the csv. If no header has been
    # written yet, one is created from the row's sorted keys. Values are
    # normalized; with :verbose the non-empty cells are printed to $stderr.
    # The options argument is currently unused.
    def csv_emit(row, options = {})
      csv_header(row.keys.sort) if !@csv

      # non-empty cells, for verbose output
      shown = { }
      row = @csv_keys.map do |i|
        s = normalize(i, row[i])
        shown[i] = s if !s.empty?
        s
      end
      $stderr.puts shown.ai if @options[:verbose]
      @csv << row
      @csv.flush
    end
  end
end
|
data/lib/sinew/nokogiri_ext.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
|
3
|
+
# modify NodeSet to join with SPACE instead of empty string
|
4
|
+
class Nokogiri::XML::NodeSet
  # keep the stock implementations reachable under old_* names
  alias :old_inner_html :inner_html
  alias :old_inner_text :inner_text

  # Inner text of each node in the set, joined with a single space.
  def inner_text
    map(&:inner_text).join(" ")
  end

  # Inner html of each node in the set, joined with a single space. Any
  # arguments are passed through to each node's inner_html.
  def inner_html(*args)
    map { |node| node.inner_html(*args) }.join(" ")
  end
end
|
15
|
+
|
16
|
+
# text_just_me
|
17
|
+
class Nokogiri::XML::Node
  # Text of this node's first direct text child, or nil when it has none.
  # Text nested inside child elements is not included.
  def text_just_me
    first_text = children.detect do |child|
      child.node_type == Nokogiri::XML::Node::TEXT_NODE
    end
    first_text.text if first_text
  end
end
|
23
|
+
class Nokogiri::XML::NodeSet
  # text_just_me of each node in the set, joined with a single space.
  def text_just_me
    map(&:text_just_me).join(" ")
  end
end
|
data/lib/sinew/text_util.rb
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
require "active_support/core_ext"
|
2
|
+
require "set"
|
3
|
+
|
4
|
+
module Sinew
  # Helpers for cleaning up html: running it through tidy, stripping
  # attributes and tags, and escaping/unescaping entities.
  module TextUtil
    extend self

    # tags that keep their attributes in html_clean_from_tidy
    ATTRS_KEEP = Set.new %w(a img iframe)

    # command line for tidy; a nil value means the flag takes no argument
    TIDY_OPTIONS = {
      "-asxml" => nil,
      "-bare" => nil,
      "-quiet" => nil,
      "-utf8" => nil,
      "-wrap" => 0,
      "--doctype" => "omit",
      "--hide-comments" => "yes",
      "--numeric-entities" => "no",
      "--preserve-entities" => "yes",
      "--force-output" => "yes",
      "-f" => "/dev/null",
    }

    # character => entity, for the five predefined xml entities
    XML_ENTITIES = { "&"=>"&amp;", "<"=>"&lt;", ">"=>"&gt;", "'"=>"&apos;", '"'=>"&quot;" }
    # entity => character
    XML_ENTITIES_INV = XML_ENTITIES.invert
    # entity => ASCII replacement, used by unent.
    # NOTE(review): the entity spellings in this table were reconstructed
    # after an HTML renderer had decoded them; the named entities follow from
    # the decoded characters, while the numeric ones (in particular &#8232;)
    # are a best guess - confirm against the upstream source.
    COMMON_ENTITIES_INV = XML_ENTITIES_INV.merge(
      "&frac12;" => "1/2",
      "&frac14;" => "1/4",
      "&frac34;" => "3/4",
      "&ldquo;" => '"',
      "&lsquo;" => "'",
      "&mdash;" => "-",
      "&nbsp;" => " ",
      "&ndash;" => "-",
      "&rdquo;" => '"',
      "&rsquo;" => "'",
      "&tilde;" => "~",
      "&#34;" => '"',
      "&#39;" => "'",
      "&#160;" => " ",
      "&#8232;" => "\n"
    )

    #
    # tidy/clean
    #

    # Pipe s through the external `tidy` program, then drop meta/link/style/
    # script tags and processing instructions and squish whitespace.
    # Returns the tidied String. Raises RuntimeError when tidy did not exit
    # normally or exited with a status above 2.
    def html_tidy(s)
      # run tidy
      args = TIDY_OPTIONS.map { |k, v| "#{k} #{v}" }.join(" ")
      s = IO.popen("tidy #{args}", "rb+") do |f|
        f.write(s)
        f.close_write
        f.read
      end
      # exitstatus instead of the deprecated Process::Status#>>; it is nil
      # when tidy was killed by a signal, which is also a failure
      status = $?.exitstatus
      raise "could not run tidy" if status.nil? || status > 2

      # now kill some tags
      s.sub!(/<html\b[^>]+>/, "<html>")
      s.gsub!(/<\/?(meta|link)\b[^>]*>/m, "")
      s.gsub!(/<(style|script)\b[^>]*(\/>|>.*?<\/\1\b>)/m, "")
      s.gsub!(/<\?[^>]*>/m, "")
      s.squish!

      # kill whitespace around tags
      s.gsub!(/ ?<([^>]+)> ?/, "<\\1>")

      s
    end

    # html_tidy followed by html_clean_from_tidy.
    def html_clean(s)
      html_clean_from_tidy(html_tidy(s))
    end

    # Strip the attributes from every tag except those in ATTRS_KEEP.
    # Returns a new String; s is not modified.
    def html_clean_from_tidy(s)
      # then kill most attrs
      s.gsub(/<([^\s>]+)[^>]*?(\/)?>/) do |tag|
        # $1 is the tag name (with a leading "/" for closing tags), $2 the
        # self-closing slash, if any
        ATTRS_KEEP.include?($1) ? tag : "<#{$1}#{$2}>"
      end
    end

    #
    # untag/unent
    #

    # Replace & < > ' " with their xml entities.
    def xml_escape(s)
      s.gsub(/[&<>'"]/) { |i| XML_ENTITIES[i] }
    end

    # Replace the five predefined xml entities with their characters.
    def xml_unescape(s)
      s.gsub(/&(amp|lt|gt|apos|quot);/) { |i| XML_ENTITIES_INV[i] }
    end

    # Replace every tag with a single space.
    def untag(s)
      s.gsub(/<[^>]+>/, " ")
    end

    # Replace entities with their ASCII equivalents from COMMON_ENTITIES_INV.
    # Entities that are not in the table are removed (the lookup yields nil).
    def unent(s)
      s.gsub(/&#?[a-z0-9]{2,};/) { |i| COMMON_ENTITIES_INV[i] }
    end
  end
end
|
data/lib/sinew/util.rb
ADDED
@@ -0,0 +1,236 @@
|
|
1
|
+
require "digest/md5"
|
2
|
+
require "etc"
|
3
|
+
require "fileutils"
|
4
|
+
|
5
|
+
module Sinew
  # Helper module for executing commands and printing stuff
  # out.
  #
  # The general idea is to only print commands that are actually
  # interesting. For example, mkdir_if_necessary won't print anything
  # if the directory already exists. That way we can scan output and
  # see what changes were made without getting lost in repetitive
  # commands that had no actual effect.
  module Util
    # Raised by run when a command exits unsuccessfully.
    class RunError < StandardError ; end

    extend self

    RESET   = "\e[0m"
    RED     = "\e[1;37;41m"
    GREEN   = "\e[1;37;42m"
    YELLOW  = "\e[1;37;43m"
    BLUE    = "\e[1;37;44m"
    MAGENTA = "\e[1;37;45m"
    CYAN    = "\e[1;37;46m"

    #
    # running commands
    #

    # Make all commands echo before running.
    def run_verbose!
      @run_verbose = true
    end

    # Run a command, raise an error upon failure. Output goes to
    # $stdout/$stderr.
    #
    # With args (an Array) the command is executed directly with those
    # arguments; without args, command is a single command line.
    # Raises RunError ("<line> failed : <exit status>") on failure, or a
    # RuntimeError if the command was interrupted by SIGINT.
    def run(command, args = nil)
      if args
        args = args.map(&:to_s)
        line = "#{command} #{args.join(" ")}"
        vputs line
        system(command, *args)
      else
        line = command
        vputs line
        system(command)
      end

      status = $?
      return if status.success?
      if status.termsig == Signal.list["INT"]
        raise "#{line} interrupted"
      end
      # exitstatus is nil when killed by a signal; report that as 0, as before
      raise RunError, "#{line} failed : #{status.exitstatus.to_i}"
    end

    # Like mkdir -p. Optionally, set the owner and mode.
    def mkdir(dir, owner = nil, mode = nil)
      FileUtils.mkdir_p(dir, :verbose => verbose?)
      chmod(dir, mode) if mode
      chown(dir, owner) if owner
    end

    # mkdir only if the directory doesn't already exist. Optionally,
    # set the owner and mode.
    def mkdir_if_necessary(dir, owner = nil, mode = nil)
      mkdir(dir, owner, mode) if !(File.exist?(dir) || File.symlink?(dir))
    end

    # rm a dir and recreate it. Refuses to run on a nil or blank path.
    def rm_and_mkdir(dir)
      raise "don't do this" if dir.nil? || dir.to_s.strip.empty?
      # argument arrays instead of an interpolated shell line, so paths
      # containing spaces or shell metacharacters are handled literally
      run("rm", ["-rf", dir])
      run("mkdir", ["-p", dir])
    end

    # Are two files different?
    def different?(a, b)
      !FileUtils.compare_file(a, b)
    end

    # Copy file or dir from src to dst. Optionally, set the mode and
    # owner of dst.
    def cp(src, dst, owner = nil, mode = nil)
      FileUtils.cp_r(src, dst, :preserve => true, :verbose => verbose?)
      if owner && !File.symlink?(dst)
        chown(dst, owner)
      end
      if mode
        chmod(dst, mode)
      end
    end

    # Copy file or dir from src to dst, but create the dst directory
    # first if necessary. Optionally, set the mode and owner of dst.
    def cp_with_mkdir(src, dst, owner = nil, mode = nil)
      mkdir_if_necessary(File.dirname(dst))
      cp(src, dst, owner, mode)
    end

    # Copy file or dir from src to dst, but ONLY if dst doesn't exist
    # or has different contents than src. Optionally, set the mode and
    # owner of dst. Returns true if a copy was made, nil otherwise.
    def cp_if_necessary(src, dst, owner = nil, mode = nil)
      if !File.exist?(dst) || different?(src, dst)
        cp(src, dst, owner, mode)
        true
      end
    end

    # Move src to dst. Because this uses FileUtils, it works even if
    # dst is on a different partition.
    def mv(src, dst)
      FileUtils.mv(src, dst, :verbose => verbose?)
    end

    # Move src to dst, but create the dst directory first if
    # necessary.
    def mv_with_mkdir(src, dst)
      mkdir_if_necessary(File.dirname(dst))
      mv(src, dst)
    end

    # Chown file to be owned by user (and the group of the same name), but
    # only if it isn't already owned by that user.
    def chown(file, user)
      user = user.to_s
      # who is the current owner? (uid lookups are cached per user)
      @uids ||= {}
      @uids[user] ||= Etc.getpwnam(user).uid
      uid = @uids[user]
      if File.stat(file).uid != uid
        run("chown", ["#{user}:#{user}", file])
      end
    end

    # Chmod file to a new mode, but only if its permission bits differ.
    def chmod(file, mode)
      # st_mode also carries the file type; compare the permission bits only
      if (File.stat(file).mode & 07777) != mode
        FileUtils.chmod(mode, file, :verbose => verbose?)
      end
    end

    # rm a file
    def rm(file)
      FileUtils.rm(file, :force => true, :verbose => verbose?)
    end

    # rm a file, but only if it exists. Returns true if it was removed,
    # nil otherwise.
    def rm_if_necessary(file)
      if File.exist?(file)
        rm(file)
        true
      end
    end

    # Create a symlink from src to dst.
    def ln(src, dst)
      FileUtils.ln_sf(src, dst, :verbose => verbose?)
    end

    # Create a symlink from src to dst, but only if it hasn't already
    # been created. A symlink pointing elsewhere is replaced. Returns true
    # if a link was created, nil otherwise.
    def ln_if_necessary(src, dst)
      needs_link = false
      if !File.symlink?(dst)
        needs_link = true
      elsif File.readlink(dst) != src
        rm(dst)
        needs_link = true
      end
      if needs_link
        ln(src, dst)
        true
      end
    end

    # Touch a file
    def touch(file)
      FileUtils.touch(file)
    end

    # A nice printout in green.
    def banner(s, color = GREEN)
      s = "#{s} ".ljust(72, " ")
      $stderr.write "#{color}[#{Time.new.strftime('%H:%M:%S')}] #{s}#{RESET}\n"
      $stderr.flush
    end

    # Print a warning in yellow.
    def warning(msg)
      banner("Warning: #{msg}", YELLOW)
    end

    # Print a fatal error in red, then exit.
    def fatal(msg)
      banner(msg, RED)
      exit(1)
    end

    # Generate some random text: len characters drawn from A-Z, a-z and 0-9.
    def random_text(len)
      chars = ("A".."Z").to_a + ("a".."z").to_a + ("0".."9").to_a
      # rand(n) returns 0...n, so every character can be picked
      (1..len).map { chars[rand(chars.length)] }.join("")
    end

    # Convert a string into something that could be a path segment:
    # a leading "/" is dropped, "..", "?", "/" and "&" become ",", any other
    # character outside A-Za-z0-9_.,=- is %-escaped, and the result is
    # lowercased. An empty result becomes "_root_".
    def pathify(s)
      s = s.gsub(/^\//, "")
      s = s.gsub("..", ",")
      s = s.gsub(/[?\/&]/, ",")
      s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |i|
        hex = i.unpack("H2").first
        "%#{hex}"
      end
      s = "_root_" if s.empty?
      s = s.downcase
      s
    end

    # checksum some text
    def md5(s)
      Digest::MD5.hexdigest(s.to_s)
    end

    private

    # Returns true if verbosity is turned on.
    def verbose?
      @run_verbose ||= nil
    end

    # Print s to $stderr, but only when verbose.
    def vputs(s)
      $stderr.puts s if verbose?
    end
  end
end
|
data/sample.sinew
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# fetch (or load from cache) the bestseller page
get "http://www.amazon.com/gp/bestsellers/books/ref=sv_b_3"

# one csv row per book on the page
noko.css(".zg_itemRow").each do |book|
  link  = book.css(".zg_title a").first
  image = book.css(".zg_itemImage_normal img").first
  csv_emit(
    :url   => link[:href],
    :title => book.css(".zg_title"),
    :img   => image[:src]
  )
end
|
data/sinew.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
$LOAD_PATH << File.expand_path("../lib", __FILE__)
|
2
|
+
|
3
|
+
require "sinew/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
  s.name        = "sinew"
  s.version     = Sinew::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Adam Doppelt"]
  s.email       = ["amd@gurge.com"]
  s.homepage    = "http://github.com/gurgeous/sinew"
  s.summary     = "Sinew - structured web crawling using recipes."
  s.description = "Crawl web sites easily using ruby recipes, with caching and nokogiri."
  # the bundled LICENSE file is the MIT license; declare it so the
  # published metadata no longer shows an empty license list
  s.license     = "MIT"

  # rubyforge_project is intentionally not set: RubyForge no longer exists
  # and RubyGems has deprecated the attribute.

  s.add_runtime_dependency "activesupport"
  s.add_runtime_dependency "awesome_print"
  s.add_runtime_dependency "htmlentities"
  s.add_runtime_dependency "nokogiri"
  s.add_runtime_dependency "stringex"
  s.add_runtime_dependency "trollop"
  s.add_development_dependency "rake"

  # package whatever git tracks
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
metadata
ADDED
@@ -0,0 +1,179 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sinew
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Adam Doppelt
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-06-04 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: activesupport
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: awesome_print
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: htmlentities
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: nokogiri
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: stringex
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: trollop
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: rake
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
126
|
+
description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
|
127
|
+
email:
|
128
|
+
- amd@gurge.com
|
129
|
+
executables:
|
130
|
+
- sinew
|
131
|
+
extensions: []
|
132
|
+
extra_rdoc_files: []
|
133
|
+
files:
|
134
|
+
- .gitignore
|
135
|
+
- Gemfile
|
136
|
+
- LICENSE
|
137
|
+
- README.md
|
138
|
+
- Rakefile
|
139
|
+
- bin/sinew
|
140
|
+
- lib/sinew.rb
|
141
|
+
- lib/sinew/curler.rb
|
142
|
+
- lib/sinew/main.rb
|
143
|
+
- lib/sinew/nokogiri_ext.rb
|
144
|
+
- lib/sinew/text_util.rb
|
145
|
+
- lib/sinew/util.rb
|
146
|
+
- lib/sinew/version.rb
|
147
|
+
- sample.sinew
|
148
|
+
- sinew.gemspec
|
149
|
+
homepage: http://github.com/gurgeous/sinew
|
150
|
+
licenses: []
|
151
|
+
post_install_message:
|
152
|
+
rdoc_options: []
|
153
|
+
require_paths:
|
154
|
+
- lib
|
155
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
156
|
+
none: false
|
157
|
+
requirements:
|
158
|
+
- - ! '>='
|
159
|
+
- !ruby/object:Gem::Version
|
160
|
+
version: '0'
|
161
|
+
segments:
|
162
|
+
- 0
|
163
|
+
hash: 106543959769779396
|
164
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
165
|
+
none: false
|
166
|
+
requirements:
|
167
|
+
- - ! '>='
|
168
|
+
- !ruby/object:Gem::Version
|
169
|
+
version: '0'
|
170
|
+
segments:
|
171
|
+
- 0
|
172
|
+
hash: 106543959769779396
|
173
|
+
requirements: []
|
174
|
+
rubyforge_project: sinew
|
175
|
+
rubygems_version: 1.8.21
|
176
|
+
signing_key:
|
177
|
+
specification_version: 3
|
178
|
+
summary: Sinew - structured web crawling using recipes.
|
179
|
+
test_files: []
|