sinew 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +6 -0
- data/Gemfile +2 -0
- data/LICENSE +20 -0
- data/README.md +34 -0
- data/Rakefile +30 -0
- data/bin/sinew +16 -0
- data/lib/sinew.rb +6 -0
- data/lib/sinew/curler.rb +173 -0
- data/lib/sinew/main.rb +172 -0
- data/lib/sinew/nokogiri_ext.rb +27 -0
- data/lib/sinew/text_util.rb +103 -0
- data/lib/sinew/util.rb +236 -0
- data/lib/sinew/version.rb +4 -0
- data/sample.sinew +8 -0
- data/sinew.gemspec +29 -0
- metadata +179 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Adam Doppelt
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
## Welcome to Sinew
|
2
|
+
|
3
|
+
Sinew collects structured data from web sites (screen scraping). It provides a Ruby DSL built for crawling, a robust caching system, and integration with [Nokogiri](http://nokogiri.org). Though small, this project is the culmination of years of effort based on crawling systems built at several different companies.
|
4
|
+
|
5
|
+
Sinew requires Ruby 1.9, [HTML Tidy](http://tidy.sourceforge.net) and [Curl](http://curl.haxx.se).
|
6
|
+
|
7
|
+
## Example
|
8
|
+
|
9
|
+
Here's an example for collecting Amazon's bestseller list:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
# get the url
|
13
|
+
get "http://www.amazon.com/gp/bestsellers/books/ref=sv_b_3"
|
14
|
+
|
15
|
+
# use nokogiri to find books
|
16
|
+
noko.css(".zg_itemRow").each do |item|
|
17
|
+
# pull out the stuff we care about using nokogiri
|
18
|
+
row = { }
|
19
|
+
row[:url] = item.css(".zg_title a").first[:href]
|
20
|
+
row[:title] = item.css(".zg_title")
|
21
|
+
row[:img] = item.css(".zg_itemImage_normal img").first[:src]
|
22
|
+
|
23
|
+
# append a row to the csv
|
24
|
+
csv_emit(row)
|
25
|
+
end
|
26
|
+
```
|
27
|
+
|
28
|
+
If you paste this into a file called `bestsellers.sinew` and run `sinew bestsellers.sinew`, it will create a `bestsellers.csv` file containing the url, title and img for each bestseller.
|
29
|
+
|
30
|
+
## Full Documentation
|
31
|
+
|
32
|
+
Full docs are in the wiki:
|
33
|
+
|
34
|
+
https://github.com/gurgeous/sinew/wiki
|
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require "bundler"
|
2
|
+
require "bundler/setup"
|
3
|
+
require "rake"
|
4
|
+
|
5
|
+
$LOAD_PATH << File.expand_path("../lib", __FILE__)
|
6
|
+
require "sinew/version"
|
7
|
+
|
8
|
+
#
|
9
|
+
# gem
|
10
|
+
#
|
11
|
+
|
12
|
+
# `rake gem` is an alias for `rake build`.
task :gem => :build

# Build sinew-<version>.gem from the gemspec. `sh` (unlike `system`) aborts
# the task when the command fails, so dependent tasks never run against a
# broken build.
task :build do
  sh "gem build --quiet sinew.gemspec"
end

# Build, then install the freshly built gem system-wide.
task :install => :build do
  sh "sudo gem install --quiet sinew-#{Sinew::VERSION}.gem"
end

# Build, tag the current version, push tags and publish the gem. Each step
# must succeed before the next one runs.
task :release => :build do
  sh "git tag -a #{Sinew::VERSION} -m 'Tagging #{Sinew::VERSION}'"
  sh "git push --tags"
  sh "gem push sinew-#{Sinew::VERSION}.gem"
end

task :default => :gem
|
28
|
+
|
29
|
+
# to test:
|
30
|
+
# block ; rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
|
data/bin/sinew
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "sinew"
|
4
|
+
require "trollop"
|
5
|
+
|
6
|
+
# ARGV
|
7
|
+
options = Trollop.options do
|
8
|
+
banner "Usage: sinew [options] <gub.sinew>"
|
9
|
+
opt :verbose, "Dump every row"
|
10
|
+
end
|
11
|
+
Trollop.die "need a .sinew file to run against" if ARGV.blank?
|
12
|
+
|
13
|
+
# now run!
|
14
|
+
ARGV.each do |i|
|
15
|
+
Sinew::Main.new(options.merge(file: i))
|
16
|
+
end
|
data/lib/sinew.rb
ADDED
data/lib/sinew/curler.rb
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
require "uri"
|
2
|
+
|
3
|
+
module Sinew
|
4
|
+
class Curler
|
5
|
+
class Error < StandardError ; end
|
6
|
+
|
7
|
+
DEFAULT_OPTIONS = {
|
8
|
+
:cache_errors => true,
|
9
|
+
:max_time => 30,
|
10
|
+
:retry => 3,
|
11
|
+
:verbose => true,
|
12
|
+
}
|
13
|
+
|
14
|
+
attr_reader :url, :uri, :root
|
15
|
+
|
16
|
+
# Create a curler.
#
# options are merged over DEFAULT_OPTIONS. Keys read here:
#   :user_agent - passed to curl via --user-agent
#   :max_time   - passed to curl via --max-time
#   :retry      - passed to curl via --retry
#   :dir        - cache root; when absent, "$HOME/.sinew" is used if $HOME
#                 exists and is writable, otherwise "/tmp/sinew"
def initialize(options = {})
  @options = DEFAULT_OPTIONS.merge(options)
  @curl_args = ["--silent", "--fail", "--user-agent", @options[:user_agent], "--max-time", @options[:max_time], "--retry", @options[:retry], "--location", "--max-redirs", "3"]
  @last_request = Time.at(0)

  @root = @options[:dir]
  if !@root
    # HOME may be unset (cron, stripped-down containers); fall back to /tmp
    # instead of raising a TypeError on File.exist?(nil).
    home = ENV["HOME"]
    if home && File.exist?(home) && File.writable?(home)
      @root = "#{home}/.sinew"
    else
      @root = "/tmp/sinew"
    end
  end
end
|
30
|
+
|
31
|
+
def get(url)
|
32
|
+
curl(url, nil)
|
33
|
+
end
|
34
|
+
|
35
|
+
def post(url, body)
|
36
|
+
curl(url, body)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Fetch url (POSTing body when body is given), caching the response on disk.
#
# The response body is stored under the cache root at a path derived from
# the uri (plus the pathified body for POSTs); the response headers are
# stored in a sibling "head" directory. When the cache file already exists
# curl is not run again.
#
# After the call, #url / #uri reflect any redirects recorded in the cached
# headers.
#
# Returns the path of the cached body.
# Raises Curler::Error when curl fails (or when a failure was cached).
def curl(url, body)
  #
  # prepare url/uri and calculate paths
  #

  @uri = url.is_a?(URI) ? url : Curler.url_to_uri(url.to_s)
  @url = @uri.to_s

  path = fullpath(@uri)
  path = "#{path},#{Util.pathify(body)}" if body

  # shorten long paths
  if path.length > 250
    dir, base = File.dirname(path), File.basename(path)
    path = "#{dir}/#{Util.md5(base)}"
  end

  head = "#{File.dirname(path)}/head/#{File.basename(path)}"

  if !File.exist?(path)
    verbose(body ? "curl #{@url} (POST)" : "curl #{@url}")
    tmp = "/tmp/curler_#{Util.random_text(6)}"
    tmph = "#{tmp}.head"
    begin
      rate_limit
      Util.mkdir_if_necessary(File.dirname(path))
      Util.mkdir_if_necessary(File.dirname(head))
      begin
        command = []
        command += @curl_args
        if body
          command += ["--data-binary", body]
          command += ["--header", "Content-Type: application/x-www-form-urlencoded"]
        end
        command += ["--output", tmp]
        command += ["--dump-header", tmph]
        command << @url

        Util.run("curl", command)

        # empty response?
        if !File.exist?(tmp)
          Util.touch(tmp)
          Util.touch(tmph)
        end
      rescue Util::RunError => e
        message = "curl error"
        # Util.run appends the exit status to its message; surface it
        if e.message =~ /(\d+)$/
          message = "#{message} (#{$1})"
        end

        # cache the error?
        if @options[:cache_errors]
          File.open(path, "w") { |f| f.puts "" }
          File.open(head, "w") { |f| f.puts "CURLER_ERROR\t#{message}" }
        end

        raise Error, message
      end
      Util.mv(tmp, path)
      Util.mv(tmph, head)
    ensure
      # only present if we failed before the mv calls above
      Util.rm_if_necessary(tmp)
      Util.rm_if_necessary(tmph)
    end
  end

  #
  # handle redirects (recalculate @uri/@url)
  #

  if File.exist?(head)
    head_contents = File.read(head)
    # handle cached errors
    if head_contents =~ /^CURLER_ERROR\t(.*)/
      raise Error, $1
    end
    original = @uri
    # Walk EVERY response header block in the dump, not just the first one
    # (a \A anchor would stop after one match), so a chain of redirects
    # resolves to the final location. Also accept "HTTP/2 301" status lines
    # and lowercase "location:" headers.
    head_contents.scan(/^(HTTP\/\d(?:\.\d)? (\d+).*?\r\n\r\n)/m) do |headers, code|
      if code =~ /^3/
        if redir = headers[/^Location: ([^\r\n]+)/i, 1]
          @uri += redir
          @url = @uri.to_s
        end
      end
    end
    # kill unnecessary head files
    if original == @uri
      Util.rm(head)
    end
  end

  path
end
|
134
|
+
|
135
|
+
def verbose(s)
|
136
|
+
$stderr.puts s if @options[:verbose]
|
137
|
+
end
|
138
|
+
|
139
|
+
#
|
140
|
+
# helpers
|
141
|
+
#
|
142
|
+
|
143
|
+
def fullpath(uri)
|
144
|
+
"#{@root}/#{Curler.uri_to_path(uri)}"
|
145
|
+
end
|
146
|
+
|
147
|
+
def uncache!(url)
|
148
|
+
Util.rm_if_necessary("#{@root}/#{Curler.url_to_path(url)}")
|
149
|
+
end
|
150
|
+
|
151
|
+
# Parse a url string into a URI, percent-encoding spaces and single quotes
# first (the two characters this method rewrites before URI.parse).
def self.url_to_uri(url)
  escaped = url.gsub(/[ ']/, " " => "%20", "'" => "%27")
  URI.parse(escaped)
end
|
156
|
+
|
157
|
+
def self.url_to_path(url)
|
158
|
+
uri_to_path(url_to_uri(url))
|
159
|
+
end
|
160
|
+
|
161
|
+
def self.uri_to_path(uri)
|
162
|
+
s = uri.path
|
163
|
+
s = "#{s}?#{uri.query}" if uri.query
|
164
|
+
"#{Util.pathify(uri.host)}/#{Util.pathify(s)}"
|
165
|
+
end
|
166
|
+
|
167
|
+
# Block until at least one second has passed since the previous request,
# then record the current time as the new "last request" timestamp.
def rate_limit
  wait = (@last_request + 1) - Time.now
  sleep(wait) if wait > 0
  @last_request = Time.now
end
|
172
|
+
end
|
173
|
+
end
|
data/lib/sinew/main.rb
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
require "nokogiri" # must be loaded before awesome_print
|
2
|
+
require "awesome_print"
|
3
|
+
require "cgi"
|
4
|
+
require "csv"
|
5
|
+
require "htmlentities"
|
6
|
+
require "stringex"
|
7
|
+
|
8
|
+
module Sinew
|
9
|
+
class Main
|
10
|
+
CODER = HTMLEntities.new
|
11
|
+
|
12
|
+
attr_accessor :url, :uri, :raw
|
13
|
+
|
14
|
+
# Run a .sinew recipe.
#
# options[:file] is the recipe path; the process exits via Util.fatal when
# it does not exist. The recipe is instance_eval'd, so its code calls this
# object's methods (get, post, noko, csv_emit, ...) directly. A banner with
# the elapsed time is printed when the recipe finishes.
def initialize(options)
  @options = options.dup
  @csv = @path = nil

  @curler = Curler.new(user_agent: "sinew/#{VERSION}")

  file = @options[:file]
  if !File.exist?(file)
    Util.fatal("#{file} not found")
  end

  tm = Time.now
  begin
    instance_eval(File.read(file, mode: "rb"), file)
  ensure
    # release the csv file handle even if the recipe raised
    @csv.close if @csv
  end
  if @path
    Util.banner("Finished #{@path} in #{(Time.now - tm).to_i}s.")
  else
    Util.banner("Finished in #{(Time.now - tm).to_i}s.")
  end
end
|
33
|
+
|
34
|
+
def get(url, params = nil)
|
35
|
+
http(url, params, :get)
|
36
|
+
end
|
37
|
+
|
38
|
+
def post(url, params = nil)
|
39
|
+
http(url, params, :post)
|
40
|
+
end
|
41
|
+
|
42
|
+
# Fetch url with the given method (:get, anything else is treated as a
# POST) and make the response the "current page".
#
# params, when given, are CGI-escaped and sorted; for :get they are appended
# to the query string, otherwise they become the request body.
#
# On success @raw holds the response bytes; on Curler::Error the message is
# printed to $stderr and @raw is "". The lazily computed @html/@clean/@noko
# are reset either way. Always returns nil.
#
# Raises RuntimeError if url does not start with "http".
def http(url, params, method)
  url = url.to_s
  # \A (not ^) so the check applies to the start of the string, not to the
  # start of any line inside it
  raise "invalid url #{url.inspect}" if url !~ /\Ahttp/i

  # decode entities
  url = CODER.decode(url)

  # handle params
  body = nil
  if params
    q = params.map { |key, value| [CGI.escape(key.to_s), CGI.escape(value.to_s)] }.sort
    q = q.map { |key, value| "#{key}=#{value}" }.join("&")
    if method == :get
      separator = url.include?("?") ? "&" : "?"
      url = "#{url}#{separator}#{q}"
    else
      body = q
    end
  end

  begin
    if method == :get
      path = @curler.get(url)
    else
      # Curler#curl only POSTs when body is truthy, so a POST without params
      # must send an empty body rather than nil (which would issue a GET).
      path = @curler.post(url, body || "")
    end
    @raw = File.read(path, mode: "rb")
  rescue Curler::Error => e
    $stderr.puts "xxx #{e.message}"
    @raw = ""
  end

  # setup local variables
  @url, @uri = @curler.url, @curler.uri
  @html = nil
  @clean = nil
  @noko = nil

  nil
end
|
82
|
+
|
83
|
+
#
|
84
|
+
# lazy accessors for cleaned up version
|
85
|
+
#
|
86
|
+
|
87
|
+
# The current page run through tidy (memoized until the next request).
#
# As a sanity check the number of "<" characters before and after tidy is
# compared. If fewer than 80% survive, tidy is retried with "<?...>"
# processing instructions stripped from the raw input; the retry is used
# when it keeps more than 80%, otherwise a warning is printed and the first
# result is kept.
def html
  @html ||= begin
    tidied = TextUtil.html_tidy(@raw)
    raw_tags = @raw.count("<")
    if raw_tags > 1
      # is there a problem with tidy?
      percent = 100 * tidied.count("<") / raw_tags
      if percent < 80
        # bad xml processing instruction? Try fixing it.
        retried = TextUtil.html_tidy(@raw.gsub(/<\?[^>]*?>/, ""))
        if 100 * retried.count("<") / raw_tags > 80
          tidied = retried
        else
          Util.warning "Hm - it looks like tidy ate some of your file (#{percent}%)" if percent < 90
        end
      end
    end
    tidied
  end
end
|
109
|
+
|
110
|
+
def clean
|
111
|
+
@clean ||= TextUtil.html_clean_from_tidy(self.html)
|
112
|
+
end
|
113
|
+
|
114
|
+
def noko
|
115
|
+
@noko ||= Nokogiri::HTML(html)
|
116
|
+
end
|
117
|
+
|
118
|
+
#
|
119
|
+
# csv
|
120
|
+
#
|
121
|
+
|
122
|
+
# Open the output csv and write its header row.
#
# args is a (possibly nested) list of column keys. If the first element is
# a String it is taken as the output file name; a name that does not start
# with "/" is resolved relative to the recipe's directory. Otherwise the
# recipe path itself is used. In both cases the file extension is replaced
# with (or, if absent, extended by) ".csv".
def csv_header(*args)
  args = args.flatten
  if args.first.is_a?(String)
    file = args.shift
    if file !~ /^\//
      file = "#{File.dirname(@options[:file])}/#{file}"
    end
  else
    file = @options[:file]
  end
  ext = File.extname(file)
  # Swap only the trailing extension. A plain gsub(ext, ".csv") would also
  # rewrite earlier occurrences, e.g. "x.sinew/y.sinew" => "x.csv/y.csv".
  file = ext.empty? ? "#{file}.csv" : "#{file[0...-ext.length]}.csv"

  @path = file
  @csv = CSV.open(file, "wb")
  @csv_keys = args
  @csv << @csv_keys
  Util.banner("Writing to #{@path}...")
end
|
141
|
+
|
142
|
+
def normalize(key, s)
|
143
|
+
case s
|
144
|
+
when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
|
145
|
+
s = s.inner_html
|
146
|
+
when Array
|
147
|
+
s = s.map { |j| j.to_s }.join("|")
|
148
|
+
else
|
149
|
+
s = s.to_s
|
150
|
+
end
|
151
|
+
s = TextUtil.untag(s)
|
152
|
+
s = s.convert_accented_entities
|
153
|
+
s = TextUtil.unent(s)
|
154
|
+
s = s.to_ascii.squish
|
155
|
+
s
|
156
|
+
end
|
157
|
+
|
158
|
+
# Append one row to the csv, opening it first (with the row's sorted keys
# as the header) if csv_header has not been called yet.
#
# Each column value is passed through normalize. With the :verbose option
# the non-empty columns are also pretty-printed to $stderr. The csv is
# flushed after every row.
def csv_emit(row, options = {})
  csv_header(row.keys.sort) unless @csv

  values = @csv_keys.map { |key| normalize(key, row[key]) }
  if @options[:verbose]
    shown = {}
    @csv_keys.zip(values) { |key, value| shown[key] = value unless value.empty? }
    $stderr.puts shown.ai
  end
  @csv << values
  @csv.flush
end
|
171
|
+
end
|
172
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
|
3
|
+
# modify NodeSet to join with SPACE instead of empty string
|
4
|
+
class Nokogiri::XML::NodeSet
|
5
|
+
alias :old_inner_html :inner_html
|
6
|
+
alias :old_inner_text :inner_text
|
7
|
+
|
8
|
+
def inner_text
|
9
|
+
collect { |i| i.inner_text }.join(" ")
|
10
|
+
end
|
11
|
+
def inner_html *args
|
12
|
+
collect { |i| i.inner_html(*args) }.join(" ")
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
# text_just_me
|
17
|
+
class Nokogiri::XML::Node
|
18
|
+
def text_just_me
|
19
|
+
t = children.find { |i| i.node_type == Nokogiri::XML::Node::TEXT_NODE }
|
20
|
+
t && t.text
|
21
|
+
end
|
22
|
+
end
|
23
|
+
class Nokogiri::XML::NodeSet
|
24
|
+
def text_just_me
|
25
|
+
map { |i| i.text_just_me }.join(" ")
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
require "active_support/core_ext"
|
2
|
+
require "set"
|
3
|
+
|
4
|
+
module Sinew
|
5
|
+
module TextUtil
|
6
|
+
extend self
|
7
|
+
|
8
|
+
ATTRS_KEEP = Set.new %w(a img iframe)
|
9
|
+
TIDY_OPTIONS = {
|
10
|
+
"-asxml" => nil,
|
11
|
+
"-bare" => nil,
|
12
|
+
"-quiet" => nil,
|
13
|
+
"-utf8" => nil,
|
14
|
+
"-wrap" => 0,
|
15
|
+
"--doctype" => "omit",
|
16
|
+
"--hide-comments" => "yes",
|
17
|
+
"--numeric-entities" => "no",
|
18
|
+
"--preserve-entities" => "yes",
|
19
|
+
"--force-output" => "yes",
|
20
|
+
"-f" => "/dev/null",
|
21
|
+
}
|
22
|
+
|
23
|
+
# Character => entity map for the five XML special characters.
XML_ENTITIES = { "&"=>"&amp;", "<"=>"&lt;", ">"=>"&gt;", "'"=>"&apos;", '"'=>"&quot;" }.freeze
# Entity => character (used by xml_unescape).
XML_ENTITIES_INV = XML_ENTITIES.invert.freeze
# Entity => ASCII replacement (used by unent). Every key must be an entity
# string of the form matched by unent's /&#?[a-z0-9]{2,};/.
# NOTE(review): the numeric entities below were reconstructed from their
# decoded characters; in particular "&#8232;" (LINE SEPARATOR) is a best
# guess for the entry that maps to "\n" - confirm against upstream.
COMMON_ENTITIES_INV = XML_ENTITIES_INV.merge(
  "&frac12;" => "1/2",
  "&frac14;" => "1/4",
  "&frac34;" => "3/4",
  "&ldquo;" => '"',
  "&lsquo;" => "'",
  "&mdash;" => "-",
  "&nbsp;" => " ",
  "&ndash;" => "-",
  "&rdquo;" => '"',
  "&rsquo;" => "'",
  "&tilde;" => "~",
  "&#34;" => '"',
  "&#39;" => "'",
  "&#160;" => " ",
  "&#8232;" => "\n"
).freeze
|
42
|
+
|
43
|
+
#
|
44
|
+
# tidy/clean
|
45
|
+
#
|
46
|
+
|
47
|
+
# Pipe s through the external `tidy` program (flags from TIDY_OPTIONS) and
# post-process the result: attributes are dropped from <html>, meta/link/
# style/script elements and "<?...>" processing instructions are removed,
# and whitespace is squished, including around tags.
#
# Returns the cleaned-up markup.
# Raises RuntimeError when tidy exits with a status above 2 or is killed by
# a signal.
def html_tidy(s)
  # run tidy
  args = TIDY_OPTIONS.map { |k, v| "#{k} #{v}" }.join(" ")
  s = IO.popen("tidy #{args}", "rb+") do |f|
    f.write(s)
    f.close_write
    f.read
  end
  # exitstatus is nil when tidy was killed by a signal. Statuses up to 2
  # are accepted (presumably tidy's warning/error codes - TODO confirm).
  status = $?.exitstatus
  raise "could not run tidy" if status.nil? || status > 2

  # now kill some tags
  s.sub!(/<html\b[^>]+>/, "<html>")
  s.gsub!(/<\/?(meta|link)\b[^>]*>/m, "")
  s.gsub!(/<(style|script)\b[^>]*(\/>|>.*?<\/\1\b>)/m, "")
  s.gsub!(/<\?[^>]*>/m, "")
  s.squish!

  # kill whitespace around tags
  s.gsub!(/ ?<([^>]+)> ?/, "<\\1>")

  s
end
|
69
|
+
|
70
|
+
def html_clean(s)
|
71
|
+
html_clean_from_tidy(html_tidy(s))
|
72
|
+
end
|
73
|
+
|
74
|
+
# Return a copy of s with the attributes stripped from every tag whose name
# is not listed in ATTRS_KEEP; a trailing "/" (self-closing) is preserved.
# s itself is not modified.
def html_clean_from_tidy(s)
  s.gsub(/<([^\s>]+)[^>]*?(\/)?>/) do |tag|
    ATTRS_KEEP.include?($1) ? tag : "<#{$1}#{$2}>"
  end
end
|
82
|
+
|
83
|
+
#
|
84
|
+
# untag/unent
|
85
|
+
#
|
86
|
+
|
87
|
+
def xml_escape(s)
|
88
|
+
s.gsub(/[&<>'"]/) { |i| XML_ENTITIES[i] }
|
89
|
+
end
|
90
|
+
|
91
|
+
def xml_unescape(s)
|
92
|
+
s.gsub(/&(amp|lt|gt|apos|quot);/) { |i| XML_ENTITIES_INV[i] }
|
93
|
+
end
|
94
|
+
|
95
|
+
def untag(s)
|
96
|
+
s.gsub(/<[^>]+>/, " ")
|
97
|
+
end
|
98
|
+
|
99
|
+
def unent(s)
|
100
|
+
s.gsub(/&#?[a-z0-9]{2,};/) { |i| COMMON_ENTITIES_INV[i] }
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
data/lib/sinew/util.rb
ADDED
@@ -0,0 +1,236 @@
|
|
1
|
+
require "digest/md5"
|
2
|
+
require "etc"
|
3
|
+
require "fileutils"
|
4
|
+
|
5
|
+
module Sinew
|
6
|
+
# Helper module for executing commands and printing stuff
|
7
|
+
# out.
|
8
|
+
#
|
9
|
+
# The general idea is to only print commands that are actually
|
10
|
+
# interesting. For example, mkdir_if_necessary won't print anything
|
11
|
+
# if the directory already exists. That way we can scan output and
|
12
|
+
# see what changes were made without getting lost in repetitive
|
13
|
+
# commands that had no actual effect.
|
14
|
+
module Util
|
15
|
+
class RunError < StandardError ; end
|
16
|
+
|
17
|
+
extend self
|
18
|
+
|
19
|
+
RESET = "\e[0m"
|
20
|
+
RED = "\e[1;37;41m"
|
21
|
+
GREEN = "\e[1;37;42m"
|
22
|
+
YELLOW = "\e[1;37;43m"
|
23
|
+
BLUE = "\e[1;37;44m"
|
24
|
+
MAGENTA = "\e[1;37;45m"
|
25
|
+
CYAN = "\e[1;37;46m"
|
26
|
+
|
27
|
+
#
|
28
|
+
# running commands
|
29
|
+
#
|
30
|
+
|
31
|
+
# Make all commands echo before running.
|
32
|
+
def run_verbose!
|
33
|
+
@run_verbose = true
|
34
|
+
end
|
35
|
+
|
36
|
+
# Run a command and raise upon failure. Output goes to $stdout/$stderr.
#
# With args (an array), command is executed with exactly those arguments
# (each converted with to_s); without args, command is handed to system as
# a single string. The command line is echoed first when verbose.
#
# Returns nil on success. Raises RuntimeError if the command was
# interrupted by SIGINT, RunError (message ends with the exit status) for
# any other failure.
def run(command, args = nil)
  argv = args && args.map(&:to_s)
  line = argv ? "#{command} #{argv.join(" ")}" : command
  vputs line
  argv ? system(command, *argv) : system(command)

  status = $?
  return if status == 0
  raise "#{line} interrupted" if status.termsig == Signal.list["INT"]
  raise RunError, "#{line} failed : #{status.to_i / 256}"
end
|
57
|
+
|
58
|
+
# Like mkdir -p. Optionally, set the owner and mode.
|
59
|
+
def mkdir(dir, owner = nil, mode = nil)
|
60
|
+
FileUtils.mkdir_p(dir, :verbose => verbose?)
|
61
|
+
chmod(dir, mode) if mode
|
62
|
+
chown(dir, owner) if owner
|
63
|
+
end
|
64
|
+
|
65
|
+
# mkdir only if nothing exists at dir yet (a symlink, even a dangling one,
# counts as existing). Optionally, set the owner and mode.
def mkdir_if_necessary(dir, owner = nil, mode = nil)
  # File.exist? - the File.exists? alias was removed in Ruby 3.2
  mkdir(dir, owner, mode) if !(File.exist?(dir) || File.symlink?(dir))
end
|
70
|
+
|
71
|
+
# rm a dir (recursively) and recreate it empty.
#
# Raises RuntimeError for a nil or blank dir, RunError if rm/mkdir fail.
def rm_and_mkdir(dir)
  raise "don't do this" if dir.nil? || dir.to_s.strip.empty?
  # Pass arguments as an array so the shell never sees dir: names with
  # spaces or metacharacters are handled verbatim, and "--" keeps a leading
  # "-" from being parsed as an option.
  run("rm", ["-rf", "--", dir])
  run("mkdir", ["-p", "--", dir])
end
|
76
|
+
|
77
|
+
# Are two files different?
|
78
|
+
def different?(a, b)
|
79
|
+
!FileUtils.compare_file(a, b)
|
80
|
+
end
|
81
|
+
|
82
|
+
# Copy file or dir from src to dst. Optionally, set the mode and
|
83
|
+
# owner of dst.
|
84
|
+
def cp(src, dst, owner = nil, mode = nil)
|
85
|
+
FileUtils.cp_r(src, dst, :preserve => true, :verbose => verbose?)
|
86
|
+
if owner && !File.symlink?(dst)
|
87
|
+
chown(dst, owner)
|
88
|
+
end
|
89
|
+
if mode
|
90
|
+
chmod(dst, mode)
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
# Copy file or dir from src to dst, but create the dst directory
|
95
|
+
# first if necessary. Optionally, set the mode and owner of dst.
|
96
|
+
def cp_with_mkdir(src, dst, owner = nil, mode = nil)
|
97
|
+
mkdir_if_necessary(File.dirname(dst))
|
98
|
+
cp(src, dst, owner, mode)
|
99
|
+
end
|
100
|
+
|
101
|
+
# Copy file or dir from src to dst, but ONLY if dst doesn't exist
# or has different contents than src. Optionally, set the mode and
# owner of dst.
#
# Returns true if a copy was made, nil otherwise.
def cp_if_necessary(src, dst, owner = nil, mode = nil)
  # File.exist? - the File.exists? alias was removed in Ruby 3.2
  if !File.exist?(dst) || different?(src, dst)
    cp(src, dst, owner, mode)
    true
  end
end
|
110
|
+
|
111
|
+
# Move src to dst. Because this uses FileUtils, it works even if
|
112
|
+
# dst is on a different partition.
|
113
|
+
def mv(src, dst)
|
114
|
+
FileUtils.mv(src, dst, :verbose => verbose?)
|
115
|
+
end
|
116
|
+
|
117
|
+
# Move src to dst, but create the dst directory first if
|
118
|
+
# necessary.
|
119
|
+
def mv_with_mkdir(src, dst)
|
120
|
+
mkdir_if_necessary(File.dirname(dst))
|
121
|
+
mv(src, dst)
|
122
|
+
end
|
123
|
+
|
124
|
+
# Chown file to be owned by user (and the group of the same name), but only
# if the file's current uid differs from the user's uid. Looked-up uids are
# cached per user name.
#
# Raises ArgumentError (from Etc.getpwnam) for an unknown user.
def chown(file, user)
  user = user.to_s
  # who is the current owner?
  @uids ||= {}
  @uids[user] ||= Etc.getpwnam(user).uid
  uid = @uids[user]
  if File.stat(file).uid != uid
    # argument array: file names containing quotes or spaces are passed to
    # chown verbatim instead of through hand-rolled shell quoting
    run("chown", ["#{user}:#{user}", file])
  end
end
|
135
|
+
|
136
|
+
# Chmod file to a new mode, but only if its permission bits differ.
#
# Returns nil when the file already has the requested mode.
def chmod(file, mode)
  # File::Stat#mode also carries the file-type bits (e.g. 0100644 for a
  # regular file); mask them off, otherwise the comparison never matches.
  if (File.stat(file).mode & 07777) != mode
    FileUtils.chmod(mode, file, :verbose => verbose?)
  end
end
|
142
|
+
|
143
|
+
# rm a file
|
144
|
+
def rm(file)
|
145
|
+
FileUtils.rm(file, :force => true, :verbose => verbose?)
|
146
|
+
end
|
147
|
+
|
148
|
+
# rm a file, but only if it exists.
#
# Returns true if the file was removed, nil otherwise.
def rm_if_necessary(file)
  # File.exist? - the File.exists? alias was removed in Ruby 3.2
  if File.exist?(file)
    rm(file)
    true
  end
end
|
155
|
+
|
156
|
+
# Create a symlink from src to dst.
|
157
|
+
def ln(src, dst)
|
158
|
+
FileUtils.ln_sf(src, dst, :verbose => verbose?)
|
159
|
+
end
|
160
|
+
|
161
|
+
# Create a symlink at dst pointing to src, unless dst is already a symlink
# to src. A symlink pointing somewhere else is removed and recreated.
#
# Returns true if the link was (re)created, nil otherwise.
def ln_if_necessary(src, dst)
  if File.symlink?(dst)
    return if File.readlink(dst) == src
    rm(dst)
  end
  ln(src, dst)
  true
end
|
176
|
+
|
177
|
+
# Touch a file
|
178
|
+
def touch(file)
|
179
|
+
FileUtils.touch(file)
|
180
|
+
end
|
181
|
+
|
182
|
+
# Print s to $stderr as a colored, timestamped banner line (green unless
# another color escape is given). The text is padded with spaces to 72
# columns so the background color forms a bar; $stderr is flushed.
def banner(s, color = GREEN)
  stamp = Time.new.strftime('%H:%M:%S')
  padded = "#{s} ".ljust(72, " ")
  $stderr.write "#{color}[#{stamp}] #{padded}#{RESET}\n"
  $stderr.flush
end
|
188
|
+
|
189
|
+
# Print a warning in yellow.
|
190
|
+
def warning(msg)
|
191
|
+
banner("Warning: #{msg}", YELLOW)
|
192
|
+
end
|
193
|
+
|
194
|
+
# Print a fatal error in red, then exit.
|
195
|
+
def fatal(msg)
|
196
|
+
banner(msg, RED)
|
197
|
+
exit(1)
|
198
|
+
end
|
199
|
+
|
200
|
+
# Generate len random characters drawn uniformly from A-Z, a-z and 0-9.
# Returns "" when len is less than 1.
def random_text(len)
  chars = ("A".."Z").to_a + ("a".."z").to_a + ("0".."9").to_a
  # Array#sample covers the whole alphabet; rand(chars.length - 1) could
  # never return the last index, so "9" was never generated.
  (1..len).map { chars.sample }.join("")
end
|
205
|
+
|
206
|
+
# Convert a string into something that could be a path segment.
#
# A leading "/" is dropped; ".." and each of "?", "/", "&" become ",";
# any other character outside A-Za-z0-9_.,=- is percent-encoded byte by
# byte; the result is lowercased. An empty result becomes "_root_".
def pathify(s)
  s = s.gsub(/^\//, "")
  s = s.gsub("..", ",")
  s = s.gsub(/[?\/&]/, ",")
  s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |c|
    # Encode EVERY byte of the character. unpack("H2") kept only the first
    # byte, so different multibyte characters collapsed to the same text.
    c.bytes.map { |b| "%%%02x" % b }.join
  end
  s = "_root_" if s.empty?
  s = s.downcase
  s
end
|
219
|
+
|
220
|
+
# checksum some text
|
221
|
+
def md5(s)
|
222
|
+
Digest::MD5.hexdigest(s.to_s)
|
223
|
+
end
|
224
|
+
|
225
|
+
private
|
226
|
+
|
227
|
+
# Returns true if verbosity is turned on.
|
228
|
+
def verbose?
|
229
|
+
@run_verbose ||= nil
|
230
|
+
end
|
231
|
+
|
232
|
+
def vputs(s)
|
233
|
+
$stderr.puts s if verbose?
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
data/sample.sinew
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
get "http://www.amazon.com/gp/bestsellers/books/ref=sv_b_3"
|
2
|
+
noko.css(".zg_itemRow").each do |item|
|
3
|
+
row = { }
|
4
|
+
row[:url] = item.css(".zg_title a").first[:href]
|
5
|
+
row[:title] = item.css(".zg_title")
|
6
|
+
row[:img] = item.css(".zg_itemImage_normal img").first[:src]
|
7
|
+
csv_emit(row)
|
8
|
+
end
|
data/sinew.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
$LOAD_PATH << File.expand_path("../lib", __FILE__)
|
2
|
+
|
3
|
+
require "sinew/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "sinew"
|
7
|
+
s.version = Sinew::VERSION
|
8
|
+
s.platform = Gem::Platform::RUBY
|
9
|
+
s.authors = ["Adam Doppelt"]
|
10
|
+
s.email = ["amd@gurge.com"]
|
11
|
+
s.homepage = "http://github.com/gurgeous/sinew"
|
12
|
+
s.summary = "Sinew - structured web crawling using recipes."
|
13
|
+
s.description = "Crawl web sites easily using ruby recipes, with caching and nokogiri."
|
14
|
+
|
15
|
+
s.rubyforge_project = "sinew"
|
16
|
+
|
17
|
+
s.add_runtime_dependency "activesupport"
|
18
|
+
s.add_runtime_dependency "awesome_print"
|
19
|
+
s.add_runtime_dependency "htmlentities"
|
20
|
+
s.add_runtime_dependency "nokogiri"
|
21
|
+
s.add_runtime_dependency "stringex"
|
22
|
+
s.add_runtime_dependency "trollop"
|
23
|
+
s.add_development_dependency "rake"
|
24
|
+
|
25
|
+
s.files = `git ls-files`.split("\n")
|
26
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
27
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
28
|
+
s.require_paths = ["lib"]
|
29
|
+
end
|
metadata
ADDED
@@ -0,0 +1,179 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sinew
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Adam Doppelt
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-06-04 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: activesupport
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: awesome_print
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: htmlentities
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: nokogiri
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: stringex
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: trollop
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: rake
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
126
|
+
description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
|
127
|
+
email:
|
128
|
+
- amd@gurge.com
|
129
|
+
executables:
|
130
|
+
- sinew
|
131
|
+
extensions: []
|
132
|
+
extra_rdoc_files: []
|
133
|
+
files:
|
134
|
+
- .gitignore
|
135
|
+
- Gemfile
|
136
|
+
- LICENSE
|
137
|
+
- README.md
|
138
|
+
- Rakefile
|
139
|
+
- bin/sinew
|
140
|
+
- lib/sinew.rb
|
141
|
+
- lib/sinew/curler.rb
|
142
|
+
- lib/sinew/main.rb
|
143
|
+
- lib/sinew/nokogiri_ext.rb
|
144
|
+
- lib/sinew/text_util.rb
|
145
|
+
- lib/sinew/util.rb
|
146
|
+
- lib/sinew/version.rb
|
147
|
+
- sample.sinew
|
148
|
+
- sinew.gemspec
|
149
|
+
homepage: http://github.com/gurgeous/sinew
|
150
|
+
licenses: []
|
151
|
+
post_install_message:
|
152
|
+
rdoc_options: []
|
153
|
+
require_paths:
|
154
|
+
- lib
|
155
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
156
|
+
none: false
|
157
|
+
requirements:
|
158
|
+
- - ! '>='
|
159
|
+
- !ruby/object:Gem::Version
|
160
|
+
version: '0'
|
161
|
+
segments:
|
162
|
+
- 0
|
163
|
+
hash: 106543959769779396
|
164
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
165
|
+
none: false
|
166
|
+
requirements:
|
167
|
+
- - ! '>='
|
168
|
+
- !ruby/object:Gem::Version
|
169
|
+
version: '0'
|
170
|
+
segments:
|
171
|
+
- 0
|
172
|
+
hash: 106543959769779396
|
173
|
+
requirements: []
|
174
|
+
rubyforge_project: sinew
|
175
|
+
rubygems_version: 1.8.21
|
176
|
+
signing_key:
|
177
|
+
specification_version: 3
|
178
|
+
summary: Sinew - structured web crawling using recipes.
|
179
|
+
test_files: []
|