sinew 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,6 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ rdoc
6
+ sample.csv
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
# Fetch gems over TLS; a plain-HTTP source can be tampered with in transit.
source "https://rubygems.org"

# Take the dependency list from sinew.gemspec.
gemspec
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 Adam Doppelt
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,34 @@
1
+ ## Welcome to Sinew
2
+
3
+ Sinew collects structured data from web sites (screen scraping). It provides a Ruby DSL built for crawling, a robust caching system, and integration with [Nokogiri](http://nokogiri.org). Though small, this project is the culmination of years of effort based on crawling systems built at several different companies.
4
+
5
+ Sinew requires Ruby 1.9, [HTML Tidy](http://tidy.sourceforge.net) and [Curl](http://curl.haxx.se).
6
+
7
+ ## Example
8
+
9
+ Here's an example for collecting Amazon's bestseller list:
10
+
11
+ ```ruby
12
+ # get the url
13
+ get "http://www.amazon.com/gp/bestsellers/books/ref=sv_b_3"
14
+
15
+ # use nokogiri to find books
16
+ noko.css(".zg_itemRow").each do |item|
17
+ # pull out the stuff we care about using nokogiri
18
+ row = { }
19
+ row[:url] = item.css(".zg_title a").first[:href]
20
+ row[:title] = item.css(".zg_title")
21
+ row[:img] = item.css(".zg_itemImage_normal img").first[:src]
22
+
23
+ # append a row to the csv
24
+ csv_emit(row)
25
+ end
26
+ ```
27
+
28
+ If you paste this into a file called `bestsellers.sinew` and run `sinew bestsellers.sinew`, it will create a `bestsellers.csv` file containing the url, title and img for each bestseller.
29
+
30
+ ## Full Documentation
31
+
32
+ Full docs are in the wiki:
33
+
34
+ https://github.com/gurgeous/sinew/wiki
@@ -0,0 +1,30 @@
1
+ require "bundler"
2
+ require "bundler/setup"
3
+ require "rake"
4
+
5
+ $LOAD_PATH << File.expand_path("../lib", __FILE__)
6
+ require "sinew/version"
7
+
8
+ #
9
+ # gem
10
+ #
11
+
12
# Every step goes through Rake's `sh`, which (unlike Kernel#system) raises
# when the command exits non-zero. A failed build therefore stops :install
# and :release instead of installing, tagging or pushing a stale gem.
task :gem => :build
task :build do
  sh "gem build --quiet sinew.gemspec"
end

task :install => :build do
  sh "sudo gem install --quiet sinew-#{Sinew::VERSION}.gem"
end

task :release => :build do
  sh "git tag -a #{Sinew::VERSION} -m 'Tagging #{Sinew::VERSION}'"
  sh "git push --tags"
  sh "gem push sinew-#{Sinew::VERSION}.gem"
end

task :default => :gem
28
+
29
+ # to test:
30
+ # block ; rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "sinew"
4
+ require "trollop"
5
+
6
+ # ARGV
7
options = Trollop.options do
  banner "Usage: sinew [options] <gub.sinew>"
  opt :verbose, "Dump every row"
end
# Array#empty? is core Ruby; #blank? only exists while ActiveSupport happens
# to have been loaded by one of the sinew requires.
Trollop.die "need a .sinew file to run against" if ARGV.empty?

# now run! Each remaining argument is a recipe file, run in order.
ARGV.each do |recipe|
  Sinew::Main.new(options.merge(file: recipe))
end
@@ -0,0 +1,6 @@
1
+ require "sinew/version"
2
+ require "sinew/util"
3
+ require "sinew/curler"
4
+ require "sinew/nokogiri_ext"
5
+ require "sinew/text_util"
6
+ require "sinew/main"
@@ -0,0 +1,173 @@
1
+ require "uri"
2
+
3
+ module Sinew
4
+ class Curler
5
+ class Error < StandardError ; end
6
+
7
+ DEFAULT_OPTIONS = {
8
+ :cache_errors => true,
9
+ :max_time => 30,
10
+ :retry => 3,
11
+ :verbose => true,
12
+ }
13
+
14
+ attr_reader :url, :uri, :root
15
+
16
# Build a Curler.
#
# options - Hash merged over DEFAULT_OPTIONS. Keys read here are
#           :user_agent, :max_time and :retry (passed to curl) and
#           :dir (cache root directory).
#
# When :dir is not given, the cache root is "$HOME/.sinew" if HOME is set,
# exists and is writable; otherwise it is "/tmp/sinew".
def initialize(options = {})
  @options = DEFAULT_OPTIONS.merge(options)
  @curl_args = ["--silent", "--fail", "--user-agent", @options[:user_agent], "--max-time", @options[:max_time], "--retry", @options[:retry], "--location", "--max-redirs", "3"]
  # epoch start, so the first request is never delayed by rate_limit
  @last_request = Time.at(0)

  @root = @options[:dir]
  if !@root
    # HOME can be unset (cron, some containers); File.exist?(nil) would raise.
    # File.exist? is used because the File.exists? alias is gone in Ruby 3.2.
    home = ENV["HOME"]
    if home && File.exist?(home) && File.stat(home).writable?
      @root = "#{home}/.sinew"
    else
      @root = "/tmp/sinew"
    end
  end
end
30
+
31
# Fetch url without a request body. Returns whatever #curl returns.
def get(url)
  no_body = nil
  curl(url, no_body)
end
34
+
35
# Fetch url, sending body as the request payload. Returns whatever #curl
# returns.
def post(url, body)
  payload = body
  curl(url, payload)
end
38
+
39
# Fetch url through the on-disk cache and return the path of the cached
# response body. When body is non-nil it is sent with --data-binary.
#
# Side effects: sets @uri/@url (moved to the redirect target when the cached
# headers contain 3xx responses) and may create files below @root.
#
# Raises Curler::Error when curl fails, or when a failure for the same
# request was cached earlier (see the :cache_errors option).
def curl(url, body)
  #
  # prepare url/uri and calculate paths
  #

  @uri = url.is_a?(URI) ? url : Curler.url_to_uri(url.to_s)
  @url = @uri.to_s

  path = fullpath(@uri)
  path = "#{path},#{Util.pathify(body)}" if body

  # shorten long paths
  if path.length > 250
    dir, base = File.dirname(path), File.basename(path)
    path = "#{dir}/#{Util.md5(base)}"
  end

  # response headers are cached beside the body, in a "head" subdirectory
  head = "#{File.dirname(path)}/head/#{File.basename(path)}"

  # File.exist? throughout: the File.exists? alias was removed in Ruby 3.2
  if !File.exist?(path)
    verbose(body ? "curl #{@url} (POST)" : "curl #{@url}")
    tmp = "/tmp/curler_#{Util.random_text(6)}"
    tmph = "#{tmp}.head"
    begin
      rate_limit
      Util.mkdir_if_necessary(File.dirname(path))
      Util.mkdir_if_necessary(File.dirname(head))
      begin
        command = []
        command += @curl_args
        if body
          command += ["--data-binary", body]
          command += ["--header", "Content-Type: application/x-www-form-urlencoded"]
        end
        command += ["--output", tmp]
        command += ["--dump-header", tmph]
        command << @url

        Util.run("curl", command)

        # empty response?
        if !File.exist?(tmp)
          Util.touch(tmp)
          Util.touch(tmph)
        end
      rescue Util::RunError => e
        message = "curl error"
        # Util.run ends its message with the exit code; surface it
        if e.message =~ /(\d+)$/
          message = "#{message} (#{$1})"
        end

        # cache the error?
        if @options[:cache_errors]
          File.open(path, "w") { |f| f.puts "" }
          File.open(head, "w") { |f| f.puts "CURLER_ERROR\t#{message}" }
        end

        raise Error, message
      end
      Util.mv(tmp, path)
      Util.mv(tmph, head)
    ensure
      # the temp files only still exist if something above failed
      Util.rm_if_necessary(tmp)
      Util.rm_if_necessary(tmph)
    end
  end

  #
  # handle redirects (recalculate @uri/@url)
  #

  if File.exist?(head)
    head_contents = File.read(head)
    # handle cached errors
    if head_contents =~ /^CURLER_ERROR\t(.*)/
      raise Error, $1
    end
    original = @uri
    # \G anchors each match where the previous one ended, so scan walks
    # every consecutive header block. The former \A could only match the
    # first block, which dropped all but the first hop of a redirect chain.
    # The status line also accepts "HTTP/2" (no minor version).
    head_contents.scan(/\G(HTTP\/\d(?:\.\d)? (\d+).*?\r\n\r\n)/m) do |headers, code|
      if code =~ /^3/
        # header names are case-insensitive (HTTP/2 sends them lowercase)
        if redir = headers[/^Location: ([^\r\n]+)/i, 1]
          @uri += redir
          @url = @uri.to_s
        end
      end
    end
    # kill unnecessary head files
    if original == @uri
      Util.rm(head)
    end
  end

  path
end
134
+
135
# Write s to $stderr, but only when the :verbose option is truthy.
def verbose(s)
  return unless @options[:verbose]
  $stderr.puts s
end
138
+
139
+ #
140
+ # helpers
141
+ #
142
+
143
# Cache location for uri: the cache root joined with Curler.uri_to_path(uri).
def fullpath(uri)
  relative = Curler.uri_to_path(uri)
  "#{@root}/#{relative}"
end
146
+
147
# Drop the cached response for url so the next fetch hits the network.
#
# Mirrors the path calculation in #curl: paths longer than 250 characters
# are stored under the md5 of their basename, and the response headers live
# in a sibling "head" directory. Both files are removed. Entries cached for
# a request with a body are keyed by url plus body and are not touched.
#
# Returns true when a cached body was removed, nil otherwise.
def uncache!(url)
  path = "#{@root}/#{Curler.url_to_path(url)}"
  if path.length > 250
    path = "#{File.dirname(path)}/#{Util.md5(File.basename(path))}"
  end
  removed = Util.rm_if_necessary(path)
  Util.rm_if_necessary("#{File.dirname(path)}/head/#{File.basename(path)}")
  removed
end
150
+
151
# Parse url into a URI after percent-encoding spaces and single quotes,
# which URI.parse would otherwise reject or mis-handle.
def self.url_to_uri(url)
  escaped = url.gsub(" ", "%20").gsub("'", "%27")
  URI.parse(escaped)
end
156
+
157
# Relative cache path for a url string: parse it, then map the URI to a path.
def self.url_to_path(url)
  uri = url_to_uri(url)
  uri_to_path(uri)
end
160
+
161
# Relative cache path for uri: "<pathified host>/<pathified path?query>".
# The query string is only appended when the uri has one.
def self.uri_to_path(uri)
  request = uri.query ? "#{uri.path}?#{uri.query}" : uri.path
  [Util.pathify(uri.host), Util.pathify(request)].join("/")
end
166
+
167
# Keep requests at least one second apart: sleep for whatever is left of the
# second following the previous request, then record the current time in
# @last_request.
def rate_limit
  # `delay`, not `sleep`: the local used to shadow Kernel#sleep, which only
  # worked because a call with parentheses is always parsed as a method call
  delay = (@last_request + 1) - Time.now
  sleep(delay) if delay > 0
  @last_request = Time.now
end
172
+ end
173
+ end
@@ -0,0 +1,172 @@
1
+ require "nokogiri" # must be loaded before awesome_print
2
+ require "awesome_print"
3
+ require "cgi"
4
+ require "csv"
5
+ require "htmlentities"
6
+ require "stringex"
7
+
8
+ module Sinew
9
+ class Main
10
+ CODER = HTMLEntities.new
11
+
12
+ attr_accessor :url, :uri, :raw
13
+
14
# Run a recipe.
#
# options - Hash; :file is the path of the recipe to evaluate, :verbose is
#           read later by csv_emit.
#
# The recipe source is instance_eval'd, so it can call get/post/noko/
# csv_emit etc. directly. Exits through Util.fatal when the file is missing.
def initialize(options)
  @options = options.dup
  @csv = @path = nil

  @curler = Curler.new(user_agent: "sinew/#{VERSION}")

  file = @options[:file]
  # File.exist?: the File.exists? alias was removed in Ruby 3.2
  if !File.exist?(file)
    Util.fatal("#{file} not found")
  end

  tm = Time.now
  begin
    instance_eval(File.read(file, mode: "rb"), file)
  ensure
    # release the CSV handle even when the recipe raises
    @csv.close if @csv
  end
  if @path
    Util.banner("Finished #{@path} in #{(Time.now - tm).to_i}s.")
  else
    Util.banner("Finished in #{(Time.now - tm).to_i}s.")
  end
end
33
+
34
# Recipe helper: fetch url with GET; params (if any) are appended to the
# query string by #http.
def get(url, params = nil)
  verb = :get
  http(url, params, verb)
end
37
+
38
# Recipe helper: fetch url with POST; params (if any) become the
# form-encoded body built by #http.
def post(url, params = nil)
  verb = :post
  http(url, params, verb)
end
41
+
42
# Fetch url and make the response the "current page".
#
# url    - anything whose to_s starts with "http" (case-insensitive);
#          HTML entities in it are decoded first.
# params - optional Hash; escaped, sorted and joined as key=value pairs,
#          then appended to the url (:get) or sent as the body (otherwise).
# method - :get or anything else (treated as POST).
#
# Sets @raw to the response bytes ("" when the Curler raised an error),
# copies @url/@uri from the curler and resets the lazy html/clean/noko
# caches. Always returns nil.
def http(url, params, method)
  url = url.to_s
  raise "invalid url #{url.inspect}" if url !~ /^http/i

  # decode entities
  url = CODER.decode(url)

  # handle params
  body = nil
  if params
    pairs = params.map { |key, value| [CGI.escape(key.to_s), CGI.escape(value.to_s)] }.sort
    query = pairs.map { |pair| pair.join("=") }.join("&")
    if method == :get
      glue = url.include?("?") ? "&" : "?"
      url = "#{url}#{glue}#{query}"
    else
      body = query
    end
  end

  begin
    path = (method == :get) ? @curler.get(url) : @curler.post(url, body)
    @raw = File.read(path, mode: "rb")
  rescue Curler::Error => e
    $stderr.puts "xxx #{e.message}"
    @raw = ""
  end

  # setup local variables
  @url, @uri = @curler.url, @curler.uri
  @html = @clean = @noko = nil

  nil
end
82
+
83
+ #
84
+ # lazy accessors for cleaned up version
85
+ #
86
+
87
# Tidied version of @raw, computed once per page and memoized in @html.
#
# As a sanity check the number of "<" characters before and after tidy is
# compared. If fewer than 80% survive, tidy is retried with "<?...>"
# processing instructions stripped from the input; the retry is kept when it
# preserves more than 80%, otherwise a warning is printed and the first
# result is used.
def html
  @html ||= begin
    s = TextUtil.html_tidy(@raw)
    nelements = @raw.count("<")
    if nelements > 1
      # is there a problem with tidy?
      percent = 100 * s.count("<") / nelements
      if percent < 80
        # bad xml processing instruction? Try fixing it.
        maybe = TextUtil.html_tidy(@raw.gsub(/<\?[^>]*?>/, ""))
        new_percent = 100 * maybe.count("<") / nelements
        if new_percent > 80
          # yes!
          s = maybe
        else
          # percent < 80 already holds here, so the old "if percent < 90"
          # guard on this line was always true and has been dropped
          Util.warning "Hm - it looks like tidy ate some of your file (#{percent}%)"
        end
      end
    end
    s
  end
end
109
+
110
# Attribute-stripped version of #html, computed on first use and memoized.
def clean
  return @clean if @clean
  @clean = TextUtil.html_clean_from_tidy(html)
end
113
+
114
# Nokogiri document parsed from #html, built on first use and memoized.
def noko
  return @noko if @noko
  @noko = Nokogiri::HTML(html)
end
117
+
118
+ #
119
+ # csv
120
+ #
121
+
122
# Open the output CSV and write its header row.
#
# args - column keys, optionally preceded by a String file name. A relative
#        file name is resolved against the recipe's directory; with no file
#        name the recipe path itself is used. Either way the extension is
#        replaced by (or, if there is none, extended with) ".csv".
#
# Sets @path, @csv and @csv_keys.
def csv_header(*args)
  args = args.flatten
  if args.first.is_a?(String)
    file = args.shift
    if file !~ /^\//
      file = "#{File.dirname(@options[:file])}/#{file}"
    end
  else
    file = @options[:file]
  end
  ext = File.extname(file)
  # Replace the trailing extension only. gsub(ext, ...) also rewrote any
  # earlier occurrence, e.g. "/x.sinew/a.sinew" became "/x.csv/a.csv".
  file = ext.empty? ? "#{file}.csv" : file.sub(/#{Regexp.escape(ext)}\z/, ".csv")

  @path = file
  @csv = CSV.open(file, "wb")
  @csv_keys = args
  @csv << @csv_keys
  Util.banner("Writing to #{@path}...")
end
141
+
142
# Turn a cell value into plain single-line ASCII text for the CSV.
#
# key - column key (not used by the conversion itself).
# s   - Nokogiri element/node set (its inner_html is taken), an Array
#       (elements joined with "|") or anything else (to_s).
#
# Tags are stripped, entities decoded, the text transliterated to ASCII and
# whitespace squished.
def normalize(key, s)
  text =
    case s
    when Nokogiri::XML::Element, Nokogiri::XML::NodeSet then s.inner_html
    when Array then s.map { |item| item.to_s }.join("|")
    else s.to_s
    end
  text = TextUtil.untag(text)
  text = text.convert_accented_entities
  text = TextUtil.unent(text)
  text.to_ascii.squish
end
157
+
158
# Append one row to the CSV.
#
# row     - Hash of column key => value. On the first call, when no header
#           has been written yet, the sorted keys of this row become the
#           header.
# options - accepted but not used.
#
# Every value is passed through #normalize. With the :verbose option the
# non-empty cells are also dumped to $stderr. The CSV is flushed after
# every row.
def csv_emit(row, options = {})
  csv_header(row.keys.sort) if !@csv

  shown = { }
  cells = @csv_keys.map do |key|
    cell = normalize(key, row[key])
    shown[key] = cell if !cell.empty?
    cell
  end
  $stderr.puts shown.ai if @options[:verbose]
  @csv << cells
  @csv.flush
end
171
+ end
172
+ end
@@ -0,0 +1,27 @@
1
+ require "nokogiri"
2
+
3
+ # modify NodeSet to join with SPACE instead of empty string
4
class Nokogiri::XML::NodeSet
  # keep the stock implementations reachable
  alias :old_inner_html :inner_html
  alias :old_inner_text :inner_text

  # Text of every node in the set, joined with a single space.
  def inner_text
    map { |node| node.inner_text }.join(" ")
  end

  # inner_html of every node in the set (args are forwarded to each node),
  # joined with a single space.
  def inner_html(*args)
    map { |node| node.inner_html(*args) }.join(" ")
  end
end
15
+
16
+ # text_just_me
17
class Nokogiri::XML::Node
  # Text of this node's first direct text child, ignoring descendants.
  # Returns nil when the node has no text child.
  def text_just_me
    first_text = children.detect do |child|
      child.node_type == Nokogiri::XML::Node::TEXT_NODE
    end
    first_text ? first_text.text : first_text
  end
end
23
class Nokogiri::XML::NodeSet
  # text_just_me of every node in the set, joined with a single space.
  def text_just_me
    own_texts = map { |node| node.text_just_me }
    own_texts.join(" ")
  end
end
@@ -0,0 +1,103 @@
1
+ require "active_support/core_ext"
2
+ require "set"
3
+
4
+ module Sinew
5
+ module TextUtil
6
+ extend self
7
+
8
+ ATTRS_KEEP = Set.new %w(a img iframe)
9
+ TIDY_OPTIONS = {
10
+ "-asxml" => nil,
11
+ "-bare" => nil,
12
+ "-quiet" => nil,
13
+ "-utf8" => nil,
14
+ "-wrap" => 0,
15
+ "--doctype" => "omit",
16
+ "--hide-comments" => "yes",
17
+ "--numeric-entities" => "no",
18
+ "--preserve-entities" => "yes",
19
+ "--force-output" => "yes",
20
+ "-f" => "/dev/null",
21
+ }
22
+
23
+ XML_ENTITIES = { "&"=>"&amp;", "<"=>"&lt;", ">"=>"&gt;", "'"=>"&apos;", '"'=>"&quot;" }
24
+ XML_ENTITIES_INV = XML_ENTITIES.invert
25
+ COMMON_ENTITIES_INV = XML_ENTITIES_INV.merge(
26
+ "&frac12;" => "1/2",
27
+ "&frac14;" => "1/4",
28
+ "&frac34;" => "3/4",
29
+ "&ldquo;" => '"',
30
+ "&lsquo;" => "'",
31
+ "&mdash;" => "-",
32
+ "&nbsp;" => " ",
33
+ "&ndash;" => "-",
34
+ "&rdquo;" => '"',
35
+ "&rsquo;" => "'",
36
+ "&tilde;" => "~",
37
+ "&#34;" => '"',
38
+ "&#39;" => "'",
39
+ "&#160;" => " ",
40
+ "&#8232;" => "\n"
41
+ )
42
+
43
+ #
44
+ # tidy/clean
45
+ #
46
+
47
# Pipe s through the external `tidy` program (flags from TIDY_OPTIONS) and
# post-process the result: attributes on <html> are dropped, meta/link/
# style/script tags and "<?...>" instructions are removed, whitespace is
# squished and single spaces next to tags are trimmed.
#
# Raises RuntimeError when tidy exits with a status above 2.
def html_tidy(s)
  # run tidy
  flags = TIDY_OPTIONS.map { |flag, value| "#{flag} #{value}" }.join(" ")
  tidied = IO.popen("tidy #{flags}", "rb+") do |pipe|
    pipe.write(s)
    pipe.close_write
    pipe.read
  end
  raise "could not run tidy" if ($? >> 8) > 2

  # now kill some tags
  tidied.sub!(/<html\b[^>]+>/, "<html>")
  tidied.gsub!(/<\/?(meta|link)\b[^>]*>/m, "")
  tidied.gsub!(/<(style|script)\b[^>]*(\/>|>.*?<\/\1\b>)/m, "")
  tidied.gsub!(/<\?[^>]*>/m, "")
  tidied.squish!

  # kill whitespace around tags
  tidied.gsub!(/ ?<([^>]+)> ?/, "<\\1>")

  tidied
end
69
+
70
# Tidy s, then strip most attributes from the tidied markup.
def html_clean(s)
  tidied = html_tidy(s)
  html_clean_from_tidy(tidied)
end
73
+
74
# Return a copy of s (already tidied markup) in which every tag is reduced
# to its bare name, keeping a trailing "/" if present. Tags whose name is
# listed in ATTRS_KEEP are left exactly as they are.
def html_clean_from_tidy(s)
  # then kill most attrs
  s.gsub(/<([^\s>]+)[^>]*?(\/)?>/) do |tag|
    name, slash = $1, $2
    ATTRS_KEEP.include?(name) ? tag : "<#{name}#{slash}>"
  end
end
82
+
83
+ #
84
+ # untag/unent
85
+ #
86
+
87
# Replace the five XML special characters (& < > ' ") in s with their
# entities, as listed in XML_ENTITIES.
def xml_escape(s)
  s.gsub(/[&<>'"]/, XML_ENTITIES)
end
90
+
91
# Inverse of xml_escape: decode &amp; &lt; &gt; &apos; and &quot; in s, in
# one pass. Other entities are left untouched.
def xml_unescape(s)
  s.gsub(/&(amp|lt|gt|apos|quot);/, XML_ENTITIES_INV)
end
94
+
95
# Replace every "<...>" tag in s with a single space.
def untag(s)
  s.gsub(%r{<[^>]+>}) { " " }
end
98
+
99
# Decode the entities listed in COMMON_ENTITIES_INV. Anything else that
# looks like an entity ("&", optional "#", two or more lowercase letters or
# digits, ";") is removed, because a missing table entry substitutes as the
# empty string.
def unent(s)
  s.gsub(/&#?[a-z0-9]{2,};/, COMMON_ENTITIES_INV)
end
102
+ end
103
+ end
@@ -0,0 +1,236 @@
1
+ require "digest/md5"
2
+ require "etc"
3
+ require "fileutils"
4
+
5
+ module Sinew
6
+ # Helper module for executing commands and printing stuff
7
+ # out.
8
+ #
9
+ # The general idea is to only print commands that are actually
10
+ # interesting. For example, mkdir_if_necessary won't print anything
11
+ # if the directory already exists. That way we can scan output and
12
+ # see what changes were made without getting lost in repetitive
13
+ # commands that had no actual effect.
14
+ module Util
15
+ class RunError < StandardError ; end
16
+
17
+ extend self
18
+
19
+ RESET = "\e[0m"
20
+ RED = "\e[1;37;41m"
21
+ GREEN = "\e[1;37;42m"
22
+ YELLOW = "\e[1;37;43m"
23
+ BLUE = "\e[1;37;44m"
24
+ MAGENTA = "\e[1;37;45m"
25
+ CYAN = "\e[1;37;46m"
26
+
27
+ #
28
+ # running commands
29
+ #
30
+
31
+ # Make all commands echo before running.
32
def run_verbose!
  # read back by verbose? (and therefore by vputs and the FileUtils wrappers)
  @run_verbose = true
end
35
+
36
+ # Run a command, raise an error upon failure. Output goes to
37
+ # $stdout/$stderr.
38
def run(command, args = nil)
  # With args the command is executed directly with that argument vector;
  # without, the whole string is handed to Kernel#system as-is.
  argv = args && args.map(&:to_s)
  line = argv ? "#{command} #{argv.join(" ")}" : command
  vputs line
  argv ? system(command, *argv) : system(command)

  return if $? == 0
  # Ctrl-C in the child surfaces as a plain RuntimeError, not a RunError
  raise "#{line} interrupted" if $?.termsig == Signal.list["INT"]
  raise RunError, "#{line} failed : #{$?.to_i / 256}"
end
57
+
58
+ # Like mkdir -p. Optionally, set the owner and mode.
59
def mkdir(dir, owner = nil, mode = nil)
  # create the whole path first, then apply mode and owner to the leaf only
  FileUtils.mkdir_p(dir, verbose: verbose?)
  chmod(dir, mode) if mode
  chown(dir, owner) if owner
end
64
+
65
+ # mkdir only if the directory doesn't already exist. Optionally,
66
+ # set the owner and mode.
67
def mkdir_if_necessary(dir, owner = nil, mode = nil)
  # A symlink counts as "already there" even when its target is missing.
  # File.exist? replaces File.exists?, which no longer exists in Ruby 3.2.
  return if File.exist?(dir) || File.symlink?(dir)
  mkdir(dir, owner, mode)
end
70
+
71
+ # rm a dir and recreate it.
72
def rm_and_mkdir(dir)
  # refuse nil / "" / whitespace-only, not just the empty string
  raise "don't do this" if dir.to_s.strip.empty?
  # Argument vectors instead of an interpolated shell line: spaces, quotes
  # or shell metacharacters in dir can no longer change what gets deleted.
  # "--" stops option parsing in case dir starts with "-". A failed rm
  # raises from run, so mkdir is still skipped as it was with "&&".
  run("rm", ["-rf", "--", dir])
  run("mkdir", ["-p", "--", dir])
end
76
+
77
+ # Are two files different?
78
def different?(a, b)
  # compare_file is true for identical contents, so invert it
  identical = FileUtils.compare_file(a, b)
  !identical
end
81
+
82
+ # Copy file or dir from src to dst. Optionally, set the mode and
83
+ # owner of dst.
84
def cp(src, dst, owner = nil, mode = nil)
  FileUtils.cp_r(src, dst, preserve: true, verbose: verbose?)
  # ownership is left alone when dst is itself a symlink
  chown(dst, owner) if owner && !File.symlink?(dst)
  chmod(dst, mode) if mode
end
93
+
94
+ # Copy file or dir from src to dst, but create the dst directory
95
+ # first if necessary. Optionally, set the mode and owner of dst.
96
def cp_with_mkdir(src, dst, owner = nil, mode = nil)
  # owner/mode apply to the copy only, not to the created parent directory
  parent = File.dirname(dst)
  mkdir_if_necessary(parent)
  cp(src, dst, owner, mode)
end
100
+
101
+ # Copy file or dir from src to dst, but ONLY if dst doesn't exist
102
+ # or has different contents than src. Optionally, set the mode and
103
+ # owner of dst.
104
def cp_if_necessary(src, dst, owner = nil, mode = nil)
  # File.exist? replaces File.exists?, which no longer exists in Ruby 3.2.
  # Returns true when a copy was made and nil when dst was already current.
  if !File.exist?(dst) || different?(src, dst)
    cp(src, dst, owner, mode)
    true
  end
end
110
+
111
+ # Move src to dst. Because this uses FileUtils, it works even if
112
+ # dst is on a different partition.
113
def mv(src, dst)
  # echoes the operation when run_verbose! has been called
  FileUtils.mv(src, dst, verbose: verbose?)
end
116
+
117
+ # Move src to dst, but create the dst directory first if
118
+ # necessary.
119
def mv_with_mkdir(src, dst)
  parent = File.dirname(dst)
  mkdir_if_necessary(parent)
  mv(src, dst)
end
123
+
124
+ # Chown file to be owned by user.
125
def chown(file, user)
  user = user.to_s
  # who is the current owner? uid lookups are cached per user name
  @uids ||= {}
  @uids[user] ||= Etc.getpwnam(user).uid
  uid = @uids[user]
  if File.stat(file).uid != uid
    # Argument vector instead of a quoted shell string: a file name that
    # contains "'" used to terminate the quoting and break (or hijack) the
    # command. The group is set to the same name as the user, as before.
    run("chown", ["#{user}:#{user}", file])
  end
end
135
+
136
+ # Chmod file to a new mode.
137
def chmod(file, mode)
  # File::Stat#mode also carries the file-type bits (e.g. 0100644 for a
  # regular file), so comparing it to 0644 directly never matched and the
  # chmod ran every time. Compare the permission bits only.
  if (File.stat(file).mode & 07777) != mode
    FileUtils.chmod(mode, file, :verbose => verbose?)
  end
end
142
+
143
+ # rm a file
144
def rm(file)
  # force: a missing file is not an error
  FileUtils.rm(file, force: true, verbose: verbose?)
end
147
+
148
+ # rm a file, but only if it exists.
149
def rm_if_necessary(file)
  # File.exist? replaces File.exists?, which no longer exists in Ruby 3.2.
  # Returns true when something was removed, nil otherwise.
  if File.exist?(file)
    rm(file)
    true
  end
end
155
+
156
+ # Create a symlink from src to dst.
157
def ln(src, dst)
  # ln -sf: an existing dst is replaced
  FileUtils.ln_sf(src, dst, verbose: verbose?)
end
160
+
161
+ # Create a symlink from src to dst, but only if it hasn't already
162
+ # been created.
163
def ln_if_necessary(src, dst)
  if File.symlink?(dst)
    # already pointing at src: nothing to do (returns nil)
    return if File.readlink(dst) == src
    # stale link: drop it before relinking
    rm(dst)
  end
  ln(src, dst)
  true
end
176
+
177
+ # Touch a file
178
def touch(file)
  # creates the file when missing, otherwise bumps its timestamps
  target = file
  FileUtils.touch(target)
end
181
+
182
+ # A nice printout in green.
183
def banner(s, color = GREEN)
  # pad to a fixed width so the coloured bar always has the same length
  padded = "#{s} ".ljust(72, " ")
  stamp = Time.new.strftime('%H:%M:%S')
  $stderr.write "#{color}[#{stamp}] #{padded}#{RESET}\n"
  $stderr.flush
end
188
+
189
+ # Print a warning in yellow.
190
def warning(msg)
  text = "Warning: #{msg}"
  banner(text, YELLOW)
end
193
+
194
+ # Print a fatal error in red, then exit.
195
def fatal(msg)
  banner(msg, RED)
  # exit status 1 signals failure to the calling shell
  status = 1
  exit(status)
end
199
+
200
+ # Generate some random text
201
def random_text(len)
  # Returns len characters drawn from A-Z, a-z and 0-9.
  chars = ("A".."Z").to_a + ("a".."z").to_a + ("0".."9").to_a
  # rand(n) already excludes n; the former rand(chars.length - 1) could
  # never select the last character ("9").
  (1..len).map { chars[rand(chars.length)] }.join("")
end
205
+
206
+ # Convert a string into something that could be a path segment
207
def pathify(s)
  # Steps: drop a leading "/", turn ".." and each of ? / & into ",",
  # percent-encode everything outside [A-Za-z0-9_.,=-], substitute "_root_"
  # for an empty result, and lowercase.
  #
  # nil (e.g. a URI without a host) is treated like the empty string.
  s = s.to_s
  s = s.gsub(/^\//, "")
  s = s.gsub("..", ",")
  s = s.gsub(/[?\/&]/, ",")
  # Encode every byte of the character. unpack("H2") kept only the first
  # byte, so all multibyte characters sharing a lead byte collapsed into the
  # same path.
  s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |ch|
    ch.bytes.map { |byte| "%%%02x" % byte }.join
  end
  s = "_root_" if s.empty?
  s.downcase
end
219
+
220
+ # checksum some text
221
def md5(s)
  # to_s first, so nil and non-strings hash their string form
  text = s.to_s
  Digest::MD5.hexdigest(text)
end
224
+
225
+ private
226
+
227
+ # Returns true if verbosity is turned on.
228
def verbose?
  # ||= also initialises the ivar, which avoids an "instance variable not
  # initialized" warning on older Rubies when run_verbose! was never called
  @run_verbose ||= nil
end
231
+
232
def vputs(s)
  # echo to $stderr only in verbose mode
  return unless verbose?
  $stderr.puts s
end
235
+ end
236
+ end
@@ -0,0 +1,4 @@
1
+ module Sinew
2
+ # Gem version
3
+ VERSION = "1.0.0"
4
+ end
@@ -0,0 +1,8 @@
1
+ get "http://www.amazon.com/gp/bestsellers/books/ref=sv_b_3"
2
+ noko.css(".zg_itemRow").each do |item|
3
+ row = { }
4
+ row[:url] = item.css(".zg_title a").first[:href]
5
+ row[:title] = item.css(".zg_title")
6
+ row[:img] = item.css(".zg_itemImage_normal img").first[:src]
7
+ csv_emit(row)
8
+ end
@@ -0,0 +1,29 @@
1
+ $LOAD_PATH << File.expand_path("../lib", __FILE__)
2
+
3
+ require "sinew/version"
4
+
5
Gem::Specification.new do |s|
  s.name = "sinew"
  s.version = Sinew::VERSION
  s.platform = Gem::Platform::RUBY
  s.authors = ["Adam Doppelt"]
  s.email = ["amd@gurge.com"]
  s.homepage = "https://github.com/gurgeous/sinew"
  s.summary = "Sinew - structured web crawling using recipes."
  s.description = "Crawl web sites easily using ruby recipes, with caching and nokogiri."
  # the bundled LICENSE file is the MIT license; declare it so the packaged
  # metadata no longer reports "licenses: []"
  s.license = "MIT"
  # README: "Sinew requires Ruby 1.9"
  s.required_ruby_version = ">= 1.9"

  # rubyforge_project is deprecated in RubyGems (RubyForge is shut down), so
  # it is no longer set here.

  s.add_runtime_dependency "activesupport"
  s.add_runtime_dependency "awesome_print"
  s.add_runtime_dependency "htmlentities"
  s.add_runtime_dependency "nokogiri"
  s.add_runtime_dependency "stringex"
  s.add_runtime_dependency "trollop"
  s.add_development_dependency "rake"

  # the file lists come from git, so the gem must be built from a checkout
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
metadata ADDED
@@ -0,0 +1,179 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sinew
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Adam Doppelt
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-06-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: activesupport
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: awesome_print
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: htmlentities
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: nokogiri
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: stringex
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: trollop
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: rake
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
127
+ email:
128
+ - amd@gurge.com
129
+ executables:
130
+ - sinew
131
+ extensions: []
132
+ extra_rdoc_files: []
133
+ files:
134
+ - .gitignore
135
+ - Gemfile
136
+ - LICENSE
137
+ - README.md
138
+ - Rakefile
139
+ - bin/sinew
140
+ - lib/sinew.rb
141
+ - lib/sinew/curler.rb
142
+ - lib/sinew/main.rb
143
+ - lib/sinew/nokogiri_ext.rb
144
+ - lib/sinew/text_util.rb
145
+ - lib/sinew/util.rb
146
+ - lib/sinew/version.rb
147
+ - sample.sinew
148
+ - sinew.gemspec
149
+ homepage: http://github.com/gurgeous/sinew
150
+ licenses: []
151
+ post_install_message:
152
+ rdoc_options: []
153
+ require_paths:
154
+ - lib
155
+ required_ruby_version: !ruby/object:Gem::Requirement
156
+ none: false
157
+ requirements:
158
+ - - ! '>='
159
+ - !ruby/object:Gem::Version
160
+ version: '0'
161
+ segments:
162
+ - 0
163
+ hash: 106543959769779396
164
+ required_rubygems_version: !ruby/object:Gem::Requirement
165
+ none: false
166
+ requirements:
167
+ - - ! '>='
168
+ - !ruby/object:Gem::Version
169
+ version: '0'
170
+ segments:
171
+ - 0
172
+ hash: 106543959769779396
173
+ requirements: []
174
+ rubyforge_project: sinew
175
+ rubygems_version: 1.8.21
176
+ signing_key:
177
+ specification_version: 3
178
+ summary: Sinew - structured web crawling using recipes.
179
+ test_files: []