sinew 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ rdoc
6
+ sample.csv
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
# Fetch gems over TLS; dependencies themselves are declared in the gemspec.
source "https://rubygems.org"

gemspec
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 Adam Doppelt
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,34 @@
1
+ ## Welcome to Sinew
2
+
3
+ Sinew collects structured data from web sites (screen scraping). It provides a Ruby DSL built for crawling, a robust caching system, and integration with [Nokogiri](http://nokogiri.org). Though small, this project is the culmination of years of effort based on crawling systems built at several different companies.
4
+
5
+ Sinew requires Ruby 1.9, [HTML Tidy](http://tidy.sourceforge.net) and [Curl](http://curl.haxx.se).
6
+
7
+ ## Example
8
+
9
+ Here's an example for collecting Amazon's bestseller list:
10
+
11
+ ```ruby
12
+ # get the url
13
+ get "http://www.amazon.com/gp/bestsellers/books/ref=sv_b_3"
14
+
15
+ # use nokogiri to find books
16
+ noko.css(".zg_itemRow").each do |item|
17
+ # pull out the stuff we care about using nokogiri
18
+ row = { }
19
+ row[:url] = item.css(".zg_title a").first[:href]
20
+ row[:title] = item.css(".zg_title")
21
+ row[:img] = item.css(".zg_itemImage_normal img").first[:src]
22
+
23
+ # append a row to the csv
24
+ csv_emit(row)
25
+ end
26
+ ```
27
+
28
+ If you paste this into a file called `bestsellers.sinew` and run `sinew bestsellers.sinew`, it will create a `bestsellers.csv` file containing the url, title and img for each bestseller.
29
+
30
+ ## Full Documentation
31
+
32
+ Full docs are in the wiki:
33
+
34
+ https://github.com/gurgeous/sinew/wiki
@@ -0,0 +1,30 @@
1
+ require "bundler"
2
+ require "bundler/setup"
3
+ require "rake"
4
+
5
+ $LOAD_PATH << File.expand_path("../lib", __FILE__)
6
+ require "sinew/version"
7
+
8
+ #
9
+ # gem
10
+ #
11
+
12
# `rake gem` is an alias for `rake build`.
task :gem => :build

# Build sinew-<version>.gem from the gemspec.
task :build do
  # sh (unlike system) aborts the task when the command exits non-zero
  sh "gem build --quiet sinew.gemspec"
end

# Build, then install the freshly built gem system-wide.
task :install => :build do
  sh "sudo gem install --quiet sinew-#{Sinew::VERSION}.gem"
end

# Build, tag the release in git, push the tags and publish the gem.
# Each step must succeed before the next one runs, so a failed tag or
# push can no longer be followed by a gem push.
task :release => :build do
  sh "git tag -a #{Sinew::VERSION} -m 'Tagging #{Sinew::VERSION}'"
  sh "git push --tags"
  sh "gem push sinew-#{Sinew::VERSION}.gem"
end

task :default => :gem
28
+
29
+ # to test:
30
+ # block ; rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "sinew"
4
+ require "trollop"
5
+
6
# Parse the command line flags. Whatever is left in ARGV afterwards is
# treated as a list of recipe files.
options = Trollop.options do
  banner "Usage: sinew [options] <gub.sinew>"
  opt :verbose, "Dump every row"
end
Trollop.die "need a .sinew file to run against" if ARGV.blank?

# Run every recipe, in the order given on the command line.
ARGV.each { |recipe| Sinew::Main.new(options.merge(file: recipe)) }
@@ -0,0 +1,6 @@
1
+ require "sinew/version"
2
+ require "sinew/util"
3
+ require "sinew/curler"
4
+ require "sinew/nokogiri_ext"
5
+ require "sinew/text_util"
6
+ require "sinew/main"
@@ -0,0 +1,173 @@
1
+ require "uri"
2
+
3
module Sinew
  # Fetches urls by running the curl command line tool, and caches every
  # response on disk under #root. A url that already has a cached file is
  # not fetched again.
  class Curler
    # Raised when curl fails, or when a cached failure is read back.
    class Error < StandardError ; end

    DEFAULT_OPTIONS = {
      :cache_errors => true,
      :max_time => 30,
      :retry => 3,
      :verbose => true,
    }

    # url/uri describe the most recent request (after redirect handling);
    # root is the cache directory.
    attr_reader :url, :uri, :root

    # options are merged over DEFAULT_OPTIONS. Additional keys read here:
    # :user_agent (passed to curl) and :dir (cache directory). Without
    # :dir the cache lives in $HOME/.sinew when $HOME exists and is
    # writable, otherwise in /tmp/sinew.
    def initialize(options = {})
      @options = DEFAULT_OPTIONS.merge(options)
      @curl_args = ["--silent", "--fail", "--user-agent", @options[:user_agent], "--max-time", @options[:max_time], "--retry", @options[:retry], "--location", "--max-redirs", "3"]
      @last_request = Time.at(0)

      @root = @options[:dir]
      if !@root
        # HOME may be unset (cron, stripped-down environments)
        home = ENV["HOME"]
        if home && File.exist?(home) && File.stat(home).writable?
          @root = "#{home}/.sinew"
        else
          @root = "/tmp/sinew"
        end
      end
    end

    # GET url. Returns the path of the cached response body.
    def get(url)
      curl(url, nil)
    end

    # POST body to url. Returns the path of the cached response body.
    def post(url, body)
      curl(url, body)
    end

    # Fetch url (a String or URI), posting body when it is non-nil, unless
    # a cached copy already exists. Sets #uri and #url, and returns the
    # path of the cached body. Raises Error when curl fails or when a
    # cached error is found.
    def curl(url, body)
      #
      # prepare url/uri and calculate paths
      #

      @uri = url.is_a?(URI) ? url : Curler.url_to_uri(url.to_s)
      @url = @uri.to_s

      path = fullpath(@uri)
      # a POST is cached separately for each distinct body
      path = "#{path},#{Util.pathify(body)}" if body

      # shorten long paths
      if path.length > 250
        dir, base = File.dirname(path), File.basename(path)
        path = "#{dir}/#{Util.md5(base)}"
      end

      # response headers are kept in a sibling "head" directory
      head = "#{File.dirname(path)}/head/#{File.basename(path)}"

      if !File.exist?(path)
        verbose(body ? "curl #{@url} (POST)" : "curl #{@url}")
        # download to temp files first, then move them into the cache
        tmp = "/tmp/curler_#{Util.random_text(6)}"
        tmph = "#{tmp}.head"
        begin
          rate_limit
          Util.mkdir_if_necessary(File.dirname(path))
          Util.mkdir_if_necessary(File.dirname(head))
          begin
            command = []
            command += @curl_args
            if body
              command += ["--data-binary", body]
              command += ["--header", "Content-Type: application/x-www-form-urlencoded"]
            end
            command += ["--output", tmp]
            command += ["--dump-header", tmph]
            command << @url

            Util.run("curl", command)

            # empty response?
            if !File.exist?(tmp)
              Util.touch(tmp)
              Util.touch(tmph)
            end
          rescue Util::RunError => e
            message = "curl error"
            # Util.run ends its message with the exit code
            if e.message =~ /(\d+)$/
              message = "#{message} (#{$1})"
            end

            # cache the error?
            if @options[:cache_errors]
              File.open(path, "w") { |f| f.puts "" }
              File.open(head, "w") { |f| f.puts "CURLER_ERROR\t#{message}" }
            end

            raise Error, message
          end
          Util.mv(tmp, path)
          Util.mv(tmph, head)
        ensure
          Util.rm_if_necessary(tmp)
          Util.rm_if_necessary(tmph)
        end
      end

      #
      # handle redirects (recalculate @uri/@url)
      #

      if File.exist?(head)
        head_contents = File.read(head)
        # handle cached errors
        if head_contents =~ /^CURLER_ERROR\t(.*)/
          raise Error, $1
        end
        original = @uri
        # NOTE(review): \A anchors at the start of the string, so this
        # appears to inspect only the first header block - confirm whether
        # chained redirects were meant to be followed.
        head_contents.scan(/\A(HTTP\/\d\.\d (\d+).*?\r\n\r\n)/m) do |i|
          headers, code = $1, $2
          if code =~ /^3/
            if redir = headers[/^Location: ([^\r\n]+)/, 1]
              # URI#+ resolves a relative Location against the current uri
              @uri += redir
              @url = @uri.to_s
            end
          end
        end
        # kill unnecessary head files
        if original == @uri
          Util.rm(head)
        end
      end

      path
    end

    # Print s to $stderr when the :verbose option is on.
    def verbose(s)
      $stderr.puts s if @options[:verbose]
    end

    #
    # helpers
    #

    # Cache path for uri, under #root.
    def fullpath(uri)
      "#{@root}/#{Curler.uri_to_path(uri)}"
    end

    # Remove the cached body for url, if there is one.
    def uncache!(url)
      Util.rm_if_necessary("#{@root}/#{Curler.url_to_path(url)}")
    end

    # Parse url into a URI, escaping spaces and single quotes first.
    def self.url_to_uri(url)
      url = url.gsub(" ", "%20")
      url = url.gsub("'", "%27")
      URI.parse(url)
    end

    # Relative cache path for a url string.
    def self.url_to_path(url)
      uri_to_path(url_to_uri(url))
    end

    # Relative cache path for a uri: "<host>/<path and query>", each part
    # run through Util.pathify.
    def self.uri_to_path(uri)
      s = uri.path
      s = "#{s}?#{uri.query}" if uri.query
      "#{Util.pathify(uri.host)}/#{Util.pathify(s)}"
    end

    # Sleep as needed so requests start at least one second apart.
    def rate_limit
      delay = (@last_request + 1) - Time.now
      sleep(delay) if delay > 0
      @last_request = Time.now
    end
  end
end
@@ -0,0 +1,172 @@
1
+ require "nokogiri" # must be loaded before awesome_print
2
+ require "awesome_print"
3
+ require "cgi"
4
+ require "csv"
5
+ require "htmlentities"
6
+ require "stringex"
7
+
8
module Sinew
  # Runs one .sinew recipe. The recipe file is instance_eval'd inside a Main
  # instance, so the public methods below (get, post, html, clean, noko,
  # csv_header, csv_emit, ...) are what a recipe can call.
  class Main
    CODER = HTMLEntities.new

    # url/uri of the most recent request and its raw response body.
    attr_accessor :url, :uri, :raw

    # options must contain :file, the path of the recipe to run; :verbose
    # makes csv_emit dump each row to $stderr. The recipe runs right here in
    # the constructor. Exits via Util.fatal when the file does not exist.
    def initialize(options)
      @options = options.dup
      @csv = @path = nil

      @curler = Curler.new(user_agent: "sinew/#{VERSION}")

      file = @options[:file]
      if !File.exist?(file)
        Util.fatal("#{file} not found")
      end

      tm = Time.now
      instance_eval(File.read(file, mode: "rb"), file)
      if @path
        Util.banner("Finished #{@path} in #{(Time.now - tm).to_i}s.")
      else
        Util.banner("Finished in #{(Time.now - tm).to_i}s.")
      end
    end

    # GET url; params (if any) are appended to the query string.
    def get(url, params = nil)
      http(url, params, :get)
    end

    # POST to url; params (if any) become the form-encoded body.
    def post(url, params = nil)
      http(url, params, :post)
    end

    # Perform the request through the curler and load the response into
    # #raw. A Curler::Error is reported on $stderr and leaves #raw empty
    # rather than aborting the recipe. Always returns nil.
    def http(url, params, method)
      url = url.to_s
      raise "invalid url #{url.inspect}" if url !~ /^http/i

      # decode entities
      url = CODER.decode(url)

      # handle params
      body = nil
      if params
        # sorted so the same params always produce the same url/body
        q = params.map { |key, value| [CGI.escape(key.to_s), CGI.escape(value.to_s)] }.sort
        q = q.map { |key, value| "#{key}=#{value}" }.join("&")
        if method == :get
          separator = url.include?("?") ? "&" : "?"
          url = "#{url}#{separator}#{q}"
        else
          body = q
        end
      end

      begin
        if method == :get
          path = @curler.get(url)
        else
          path = @curler.post(url, body)
        end
        @raw = File.read(path, mode: "rb")
      rescue Curler::Error => e
        $stderr.puts "xxx #{e.message}"
        @raw = ""
      end

      # setup local variables
      @url, @uri = @curler.url, @curler.uri
      # reset the lazily computed views of the previous response
      @html = nil
      @clean = nil
      @noko = nil

      nil
    end

    #
    # lazy accessors for cleaned up version
    #

    # The response after TextUtil.html_tidy. If tidy's output has fewer than
    # 80% as many "<" as the raw response, tidy is retried with "<?...>"
    # sequences stripped; a warning is printed when that does not help.
    def html
      @html ||= begin
        s = TextUtil.html_tidy(@raw)
        nelements = @raw.count("<")
        if nelements > 1
          # is there a problem with tidy?
          percent = 100 * s.count("<") / nelements
          if percent < 80
            # bad xml processing instruction? Try fixing it.
            maybe = TextUtil.html_tidy(@raw.gsub(/<\?[^>]*?>/, ""))
            new_percent = 100 * maybe.count("<") / nelements
            if new_percent > 80
              # yes!
              s = maybe
            else
              Util.warning "Hm - it looks like tidy ate some of your file (#{percent}%)"
            end
          end
        end
        s
      end
    end

    # #html with most tag attributes removed.
    def clean
      @clean ||= TextUtil.html_clean_from_tidy(self.html)
    end

    # #html parsed into a Nokogiri document.
    def noko
      @noko ||= Nokogiri::HTML(html)
    end

    #
    # csv
    #

    # Open the csv output and write the header row. args are the column
    # keys; if the first one is a String it names the output file (relative
    # names are resolved against the recipe's directory). Otherwise the
    # output sits next to the recipe. Either way the extension becomes .csv.
    def csv_header(*args)
      args = args.flatten
      if args.first.is_a?(String)
        file = args.shift
        if file !~ /^\//
          file = "#{File.dirname(@options[:file])}/#{file}"
        end
      else
        file = @options[:file]
      end
      ext = File.extname(file)
      # swap only the trailing extension; the same text may also appear
      # earlier in the path (e.g. in a directory name)
      file = ext.empty? ? "#{file}.csv" : "#{file.chomp(ext)}.csv"

      @path = file
      @csv = CSV.open(file, "wb")
      @csv_keys = args
      @csv << @csv_keys
      Util.banner("Writing to #{@path}...")
    end

    # Turn a cell value into plain ASCII text: nokogiri nodes become their
    # inner html, arrays are joined with "|", then tags and entities are
    # removed and whitespace is squished. key is currently unused.
    def normalize(key, s)
      case s
      when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
        s = s.inner_html
      when Array
        s = s.map { |j| j.to_s }.join("|")
      else
        s = s.to_s
      end
      s = TextUtil.untag(s)
      s = s.convert_accented_entities
      s = TextUtil.unent(s)
      s = s.to_ascii.squish
      s
    end

    # Append row (a Hash keyed by column) to the csv. The header is written
    # from the sorted row keys if csv_header was never called. options is
    # accepted but not used.
    def csv_emit(row, options = {})
      csv_header(row.keys.sort) if !@csv

      # non-empty cells only, for the verbose dump
      printable = { }
      row = @csv_keys.map do |i|
        s = normalize(i, row[i])
        printable[i] = s if !s.empty?
        s
      end
      $stderr.puts printable.ai if @options[:verbose]
      @csv << row
      @csv.flush
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require "nokogiri"
2
+
3
+ # modify NodeSet to join with SPACE instead of empty string
4
class Nokogiri::XML::NodeSet
  # keep the stock implementations reachable under old_* names
  alias_method :old_inner_html, :inner_html
  alias_method :old_inner_text, :inner_text

  # Inner text of every node in the set, separated by single spaces.
  def inner_text
    map(&:inner_text).join(" ")
  end

  # Inner html of every node in the set, separated by single spaces. Any
  # arguments are handed to each node's inner_html.
  def inner_html(*args)
    map { |node| node.inner_html(*args) }.join(" ")
  end
end
15
+
16
+ # text_just_me
17
class Nokogiri::XML::Node
  # Text of the first direct child that is a text node, or nil when this
  # node has no such child.
  def text_just_me
    own = children.find { |child| child.node_type == Nokogiri::XML::Node::TEXT_NODE }
    own ? own.text : own
  end
end
23
class Nokogiri::XML::NodeSet
  # text_just_me of every node in the set, separated by single spaces.
  def text_just_me
    map(&:text_just_me).join(" ")
  end
end
@@ -0,0 +1,103 @@
1
+ require "active_support/core_ext"
2
+ require "set"
3
+
4
module Sinew
  # Helpers for cleaning html (via the external tidy program) and for
  # stripping tags and entities from text.
  module TextUtil
    extend self

    # tags whose attributes html_clean_from_tidy leaves untouched
    ATTRS_KEEP = Set.new %w(a img iframe)
    # command line passed to tidy; a nil value means a bare flag
    TIDY_OPTIONS = {
      "-asxml" => nil,
      "-bare" => nil,
      "-quiet" => nil,
      "-utf8" => nil,
      "-wrap" => 0,
      "--doctype" => "omit",
      "--hide-comments" => "yes",
      "--numeric-entities" => "no",
      "--preserve-entities" => "yes",
      "--force-output" => "yes",
      "-f" => "/dev/null",
    }

    XML_ENTITIES = { "&"=>"&amp;", "<"=>"&lt;", ">"=>"&gt;", "'"=>"&apos;", '"'=>"&quot;" }
    XML_ENTITIES_INV = XML_ENTITIES.invert
    COMMON_ENTITIES_INV = XML_ENTITIES_INV.merge(
      "&frac12;" => "1/2",
      "&frac14;" => "1/4",
      "&frac34;" => "3/4",
      "&ldquo;" => '"',
      "&lsquo;" => "'",
      "&mdash;" => "-",
      "&nbsp;" => " ",
      "&ndash;" => "-",
      "&rdquo;" => '"',
      "&rsquo;" => "'",
      "&tilde;" => "~",
      "&#34;" => '"',
      "&#39;" => "'",
      "&#160;" => " ",
      "&#8232;" => "\n"
    )

    #
    # tidy/clean
    #

    # Pipe s through tidy (using TIDY_OPTIONS), then drop meta/link/style/
    # script tags and "<?...>" sequences, strip attributes from <html>, and
    # collapse whitespace. Raises when tidy exits with a status above 2.
    def html_tidy(s)
      # run tidy
      args = TIDY_OPTIONS.map { |k, v| "#{k} #{v}" }.join(" ")
      s = IO.popen("tidy #{args}", "rb+") do |f|
        f.write(s)
        f.close_write
        f.read
      end
      # exitstatus is nil when the process was killed by a signal; to_i
      # turns that into 0, matching what the old "$? >> 8" produced
      raise "could not run tidy" if $?.exitstatus.to_i > 2

      # now kill some tags
      s.sub!(/<html\b[^>]+>/, "<html>")
      s.gsub!(/<\/?(meta|link)\b[^>]*>/m, "")
      s.gsub!(/<(style|script)\b[^>]*(\/>|>.*?<\/\1\b>)/m, "")
      s.gsub!(/<\?[^>]*>/m, "")
      s.squish!

      # kill whitespace around tags
      s.gsub!(/ ?<([^>]+)> ?/, "<\\1>")

      s
    end

    # html_tidy followed by html_clean_from_tidy.
    def html_clean(s)
      html_clean_from_tidy(html_tidy(s))
    end

    # Return a copy of s in which every tag except those in ATTRS_KEEP has
    # lost its attributes. s itself is not modified.
    def html_clean_from_tidy(s)
      # then kill most attrs
      s = s.dup
      s.gsub!(/<([^\s>]+)[^>]*?(\/)?>/) do |i|
        ATTRS_KEEP.include?($1) ? i : "<#{$1}#{$2}>"
      end
      s
    end

    #
    # untag/unent
    #

    # Replace & < > ' " with their xml entities.
    def xml_escape(s)
      s.gsub(/[&<>'"]/) { |i| XML_ENTITIES[i] }
    end

    # Inverse of xml_escape.
    def xml_unescape(s)
      s.gsub(/&(amp|lt|gt|apos|quot);/) { |i| XML_ENTITIES_INV[i] }
    end

    # Replace every tag with a single space.
    def untag(s)
      s.gsub(/<[^>]+>/, " ")
    end

    # Replace the entities listed in COMMON_ENTITIES_INV with plain text.
    # Any other entity that matches the pattern has no entry in the table,
    # so the block yields nil and the entity is removed.
    def unent(s)
      s.gsub(/&#?[a-z0-9]{2,};/) { |i| COMMON_ENTITIES_INV[i] }
    end
  end
end
@@ -0,0 +1,236 @@
1
+ require "digest/md5"
2
+ require "etc"
3
+ require "fileutils"
4
+
5
module Sinew
  # Helper module for executing commands and printing stuff
  # out.
  #
  # The general idea is to only print commands that are actually
  # interesting. For example, mkdir_if_necessary won't print anything
  # if the directory already exists. That way we can scan output and
  # see what changes were made without getting lost in repetitive
  # commands that had no actual effect.
  module Util
    # Raised by run when a command exits with a non-zero status.
    class RunError < StandardError ; end

    extend self

    # ANSI escape sequences used by banner
    RESET = "\e[0m"
    RED = "\e[1;37;41m"
    GREEN = "\e[1;37;42m"
    YELLOW = "\e[1;37;43m"
    BLUE = "\e[1;37;44m"
    MAGENTA = "\e[1;37;45m"
    CYAN = "\e[1;37;46m"

    #
    # running commands
    #

    # Make all commands echo before running.
    def run_verbose!
      @run_verbose = true
    end

    # Run a command, raise an error upon failure. Output goes to
    # $stdout/$stderr.
    #
    # With args, the command is run directly with those arguments (each
    # converted with to_s); without args, command is handed to system as
    # a single string. Raises RunError whose message ends in the exit
    # code, or a RuntimeError if the command was interrupted by SIGINT.
    def run(command, args = nil)
      line = nil
      if args
        args = args.map(&:to_s)
        line = "#{command} #{args.join(" ")}"
        vputs line
        system(command, *args)
      else
        line = command
        vputs line
        system(command)
      end
      if $? != 0
        if $?.termsig == Signal.list["INT"]
          raise "#{line} interrupted"
        end
        raise RunError, "#{line} failed : #{$?.to_i / 256}"
      end
    end

    # Like mkdir -p. Optionally, set the owner and mode.
    def mkdir(dir, owner = nil, mode = nil)
      FileUtils.mkdir_p(dir, :verbose => verbose?)
      chmod(dir, mode) if mode
      chown(dir, owner) if owner
    end

    # mkdir only if the directory doesn't already exist. Optionally,
    # set the owner and mode.
    def mkdir_if_necessary(dir, owner = nil, mode = nil)
      mkdir(dir, owner, mode) if !(File.exist?(dir) || File.symlink?(dir))
    end

    # rm a dir and recreate it.
    def rm_and_mkdir(dir)
      raise "don't do this" if dir == ""
      run "rm -rf #{dir} && mkdir -p #{dir}"
    end

    # Are two files different?
    def different?(a, b)
      !FileUtils.compare_file(a, b)
    end

    # Copy file or dir from src to dst. Optionally, set the mode and
    # owner of dst.
    def cp(src, dst, owner = nil, mode = nil)
      FileUtils.cp_r(src, dst, :preserve => true, :verbose => verbose?)
      if owner && !File.symlink?(dst)
        chown(dst, owner)
      end
      if mode
        chmod(dst, mode)
      end
    end

    # Copy file or dir from src to dst, but create the dst directory
    # first if necessary. Optionally, set the mode and owner of dst.
    def cp_with_mkdir(src, dst, owner = nil, mode = nil)
      mkdir_if_necessary(File.dirname(dst))
      cp(src, dst, owner, mode)
    end

    # Copy file or dir from src to dst, but ONLY if dst doesn't exist
    # or has different contents than src. Optionally, set the mode and
    # owner of dst. Returns true if a copy was made, nil otherwise.
    def cp_if_necessary(src, dst, owner = nil, mode = nil)
      if !File.exist?(dst) || different?(src, dst)
        cp(src, dst, owner, mode)
        true
      end
    end

    # Move src to dst. Because this uses FileUtils, it works even if
    # dst is on a different partition.
    def mv(src, dst)
      FileUtils.mv(src, dst, :verbose => verbose?)
    end

    # Move src to dst, but create the dst directory first if
    # necessary.
    def mv_with_mkdir(src, dst)
      mkdir_if_necessary(File.dirname(dst))
      mv(src, dst)
    end

    # Chown file to be owned by user. Nothing is run if the file is
    # already owned by that user.
    def chown(file, user)
      user = user.to_s
      # who is the current owner? (uid lookups are cached per user)
      @uids ||= {}
      @uids[user] ||= Etc.getpwnam(user).uid
      uid = @uids[user]
      if File.stat(file).uid != uid
        run "chown #{user}:#{user} '#{file}'"
      end
    end

    # Chmod file to a new mode. Nothing is done if the file already has
    # that mode.
    def chmod(file, mode)
      # stat.mode also carries the file type bits; compare permissions only
      if (File.stat(file).mode & 07777) != mode
        FileUtils.chmod(mode, file, :verbose => verbose?)
      end
    end

    # rm a file
    def rm(file)
      FileUtils.rm(file, :force => true, :verbose => verbose?)
    end

    # rm a file, but only if it exists. Returns true if the file was
    # removed, nil otherwise.
    def rm_if_necessary(file)
      if File.exist?(file)
        rm(file)
        true
      end
    end

    # Create a symlink from src to dst.
    def ln(src, dst)
      FileUtils.ln_sf(src, dst, :verbose => verbose?)
    end

    # Create a symlink from src to dst, but only if it hasn't already
    # been created. Returns true if the link was (re)created, nil
    # otherwise.
    def ln_if_necessary(src, dst)
      needed = false
      if !File.symlink?(dst)
        needed = true
      elsif File.readlink(dst) != src
        # points somewhere else - replace it
        rm(dst)
        needed = true
      end
      if needed
        ln(src, dst)
        true
      end
    end

    # Touch a file
    def touch(file)
      FileUtils.touch(file)
    end

    # A nice printout in green.
    def banner(s, color = GREEN)
      s = "#{s} ".ljust(72, " ")
      $stderr.write "#{color}[#{Time.new.strftime('%H:%M:%S')}] #{s}#{RESET}\n"
      $stderr.flush
    end

    # Print a warning in yellow.
    def warning(msg)
      banner("Warning: #{msg}", YELLOW)
    end

    # Print a fatal error in red, then exit.
    def fatal(msg)
      banner(msg, RED)
      exit(1)
    end

    # Generate some random text: len characters drawn from A-Z, a-z
    # and 0-9.
    def random_text(len)
      chars = ("A".."Z").to_a + ("a".."z").to_a + ("0".."9").to_a
      # sample covers the whole alphabet, including the final "9"
      (1..len).map { chars.sample }.join("")
    end

    # Convert a string into something that could be a path segment
    def pathify(s)
      s = s.gsub(/^\//, "")
      s = s.gsub("..", ",")
      s = s.gsub(/[?\/&]/, ",")
      # percent-encode everything outside the safe set
      s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |i|
        hex = i.unpack("H2").first
        "%#{hex}"
      end
      s = "_root_" if s.empty?
      s = s.downcase
      s
    end

    # checksum some text
    def md5(s)
      Digest::MD5.hexdigest(s.to_s)
    end

    private

    # Returns true if verbosity is turned on.
    def verbose?
      @run_verbose ||= nil
    end

    # Print s to $stderr, but only in verbose mode.
    def vputs(s)
      $stderr.puts s if verbose?
    end
  end
end
@@ -0,0 +1,4 @@
1
module Sinew
  # Version string of the sinew gem; read by the gemspec, the Rakefile
  # and the curler's user agent.
  VERSION = "1.0.0"
end
@@ -0,0 +1,8 @@
1
# fetch the bestseller page
get "http://www.amazon.com/gp/bestsellers/books/ref=sv_b_3"

# emit one csv row per book on the page
noko.css(".zg_itemRow").each do |book|
  row = {
    :url => book.css(".zg_title a").first[:href],
    :title => book.css(".zg_title"),
    :img => book.css(".zg_itemImage_normal img").first[:src],
  }
  csv_emit(row)
end
@@ -0,0 +1,29 @@
1
+ $LOAD_PATH << File.expand_path("../lib", __FILE__)
2
+
3
+ require "sinew/version"
4
+
5
Gem::Specification.new do |s|
  s.name        = "sinew"
  s.version     = Sinew::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Adam Doppelt"]
  s.email       = ["amd@gurge.com"]
  s.homepage    = "http://github.com/gurgeous/sinew"
  s.summary     = "Sinew - structured web crawling using recipes."
  s.description = "Crawl web sites easily using ruby recipes, with caching and nokogiri."
  # the bundled LICENSE file is the MIT license
  s.licenses    = ["MIT"]

  # the README states that Ruby 1.9 is required
  s.required_ruby_version = ">= 1.9"

  s.rubyforge_project = "sinew"

  s.add_runtime_dependency "activesupport"
  s.add_runtime_dependency "awesome_print"
  s.add_runtime_dependency "htmlentities"
  s.add_runtime_dependency "nokogiri"
  s.add_runtime_dependency "stringex"
  s.add_runtime_dependency "trollop"
  s.add_development_dependency "rake"

  # package exactly what git tracks
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
metadata ADDED
@@ -0,0 +1,179 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sinew
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Adam Doppelt
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-06-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: activesupport
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: awesome_print
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: htmlentities
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: nokogiri
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: stringex
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: trollop
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: rake
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
127
+ email:
128
+ - amd@gurge.com
129
+ executables:
130
+ - sinew
131
+ extensions: []
132
+ extra_rdoc_files: []
133
+ files:
134
+ - .gitignore
135
+ - Gemfile
136
+ - LICENSE
137
+ - README.md
138
+ - Rakefile
139
+ - bin/sinew
140
+ - lib/sinew.rb
141
+ - lib/sinew/curler.rb
142
+ - lib/sinew/main.rb
143
+ - lib/sinew/nokogiri_ext.rb
144
+ - lib/sinew/text_util.rb
145
+ - lib/sinew/util.rb
146
+ - lib/sinew/version.rb
147
+ - sample.sinew
148
+ - sinew.gemspec
149
+ homepage: http://github.com/gurgeous/sinew
150
+ licenses: []
151
+ post_install_message:
152
+ rdoc_options: []
153
+ require_paths:
154
+ - lib
155
+ required_ruby_version: !ruby/object:Gem::Requirement
156
+ none: false
157
+ requirements:
158
+ - - ! '>='
159
+ - !ruby/object:Gem::Version
160
+ version: '0'
161
+ segments:
162
+ - 0
163
+ hash: 106543959769779396
164
+ required_rubygems_version: !ruby/object:Gem::Requirement
165
+ none: false
166
+ requirements:
167
+ - - ! '>='
168
+ - !ruby/object:Gem::Version
169
+ version: '0'
170
+ segments:
171
+ - 0
172
+ hash: 106543959769779396
173
+ requirements: []
174
+ rubyforge_project: sinew
175
+ rubygems_version: 1.8.21
176
+ signing_key:
177
+ specification_version: 3
178
+ summary: Sinew - structured web crawling using recipes.
179
+ test_files: []