sinew 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,6 +1,7 @@
1
1
  require "bundler"
2
2
  require "bundler/setup"
3
3
  require "rake"
4
+ require "rake/testtask"
4
5
 
5
6
  $LOAD_PATH << File.expand_path("../lib", __FILE__)
6
7
  require "sinew/version"
@@ -24,7 +25,15 @@ task :release => :build do
24
25
  system "gem push sinew-#{Sinew::VERSION}.gem"
25
26
  end
26
27
 
27
- task :default => :gem
28
+ #
29
+ # minitest
30
+ #
31
+
32
+ Rake::TestTask.new(:test) do |test|
33
+ test.libs << "test"
34
+ end
35
+
36
+ task :default => :test
28
37
 
29
38
  # to test:
30
39
  # block ; rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
data/bin/sinew CHANGED
@@ -3,10 +3,14 @@
3
3
  require "sinew"
4
4
  require "trollop"
5
5
 
6
+ raise "Sinew requires Ruby 1.9 or higher" if RUBY_VERSION < "1.9"
7
+
6
8
  # ARGV
7
9
  options = Trollop.options do
8
10
  banner "Usage: sinew [options] <gub.sinew>"
11
+ opt :cache, "Set the cache directory (defaults to ~/.sinew)"
9
12
  opt :verbose, "Dump every row"
13
+ opt :quiet, "Be quiet"
10
14
  end
11
15
  Trollop.die "need a .sinew file to run against" if ARGV.blank?
12
16
 
data/lib/sinew/main.rb CHANGED
@@ -13,73 +13,17 @@ module Sinew
13
13
 
14
14
  def initialize(options)
15
15
  @options = options.dup
16
- @csv = @path = nil
17
-
18
- @curler = Curler.new(user_agent: "sinew/#{VERSION}")
19
-
20
- file = @options[:file]
21
- if !File.exists?(file)
22
- Util.fatal("#{file} not found")
23
- end
24
-
25
- tm = Time.now
26
- instance_eval(File.read(file, mode: "rb"), file)
27
- if @path
28
- Util.banner("Finished #{@path} in #{(Time.now - tm).to_i}s.")
29
- else
30
- Util.banner("Finished in #{(Time.now - tm).to_i}s.")
31
- end
16
+ _run if !@options[:test]
32
17
  end
33
18
 
34
19
  def get(url, params = nil)
35
- http(url, params, :get)
20
+ _http(url, params, :get)
36
21
  end
37
22
 
38
23
  def post(url, params = nil)
39
- http(url, params, :post)
24
+ _http(url, params, :post)
40
25
  end
41
26
 
42
- def http(url, params, method)
43
- url = url.to_s
44
- raise "invalid url #{url.inspect}" if url !~ /^http/i
45
-
46
- # decode entities
47
- url = CODER.decode(url)
48
-
49
- # handle params
50
- body = nil
51
- if params
52
- q = params.map { |key, value| [CGI.escape(key.to_s), CGI.escape(value.to_s)] }.sort
53
- q = q.map { |key, value| "#{key}=#{value}" }.join("&")
54
- if method == :get
55
- separator = url.include?(??) ? "&" : "?"
56
- url = "#{url}#{separator}#{q}"
57
- else
58
- body = q
59
- end
60
- end
61
-
62
- begin
63
- if method == :get
64
- path = @curler.get(url)
65
- else
66
- path = @curler.post(url, body)
67
- end
68
- @raw = File.read(path, mode: "rb")
69
- rescue Curler::Error => e
70
- $stderr.puts "xxx #{e.message}"
71
- @raw = ""
72
- end
73
-
74
- # setup local variables
75
- @url, @uri = @curler.url, @curler.uri
76
- @html = nil
77
- @clean = nil
78
- @noko = nil
79
-
80
- nil
81
- end
82
-
83
27
  #
84
28
  # lazy accessors for cleaned up version
85
29
  #
@@ -136,10 +80,92 @@ module Sinew
136
80
  @csv = CSV.open(file, "wb")
137
81
  @csv_keys = args
138
82
  @csv << @csv_keys
139
- Util.banner("Writing to #{@path}...")
83
+ _banner("Writing to #{@path}...")
140
84
  end
141
85
 
142
- def normalize(key, s)
86
+ def csv_emit(row, options = {})
87
+ csv_header(row.keys.sort) if !@csv
88
+
89
+ print = { }
90
+ row = @csv_keys.map do |i|
91
+ s = _normalize(row[i], i)
92
+ print[i] = s if !s.empty?
93
+ s
94
+ end
95
+ $stderr.puts print.ai if @options[:verbose]
96
+ @csv << row
97
+ @csv.flush
98
+ end
99
+
100
+ protected
101
+
102
+ def _curler
103
+ @curler ||= begin
104
+ # curler
105
+ options = { user_agent: "sinew/#{VERSION}" }
106
+ options[:dir] = @options[:cache] if @options[:cache]
107
+ options[:verbose] = false if @options[:quiet]
108
+ Curler.new(options)
109
+ end
110
+ end
111
+
112
+ def _run
113
+ @csv = @path = nil
114
+
115
+ file = @options[:file]
116
+ if !File.exists?(file)
117
+ Util.fatal("#{file} not found")
118
+ end
119
+
120
+ tm = Time.now
121
+ instance_eval(File.read(file, mode: "rb"), file)
122
+ if @path
123
+ _banner("Finished #{@path} in #{(Time.now - tm).to_i}s.")
124
+ else
125
+ _banner("Finished in #{(Time.now - tm).to_i}s.")
126
+ end
127
+ end
128
+
129
+ def _http(url, params, method)
130
+ url = url.to_s
131
+ raise "invalid url #{url.inspect}" if url !~ /^http/i
132
+
133
+ # decode entities
134
+ url = CODER.decode(url)
135
+
136
+ # handle params
137
+ body = nil
138
+ if params
139
+ q = params.map { |key, value| [CGI.escape(key.to_s), CGI.escape(value.to_s)] }.sort
140
+ q = q.map { |key, value| "#{key}=#{value}" }.join("&")
141
+ if method == :get
142
+ separator = url.include?(??) ? "&" : "?"
143
+ url = "#{url}#{separator}#{q}"
144
+ else
145
+ body = q
146
+ end
147
+ end
148
+
149
+ begin
150
+ if method == :get
151
+ path = _curler.get(url)
152
+ else
153
+ path = _curler.post(url, body)
154
+ end
155
+ @raw = File.read(path, mode: "rb")
156
+ rescue Curler::Error => e
157
+ $stderr.puts "xxx #{e.message}"
158
+ @raw = ""
159
+ end
160
+
161
+ # setup local variables
162
+ @url, @uri = _curler.url, _curler.uri
163
+ @html = nil
164
+ @clean = nil
165
+ @noko = nil
166
+ end
167
+
168
+ def _normalize(s, key = nil)
143
169
  case s
144
170
  when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
145
171
  s = s.inner_html
@@ -155,18 +181,8 @@ module Sinew
155
181
  s
156
182
  end
157
183
 
158
- def csv_emit(row, options = {})
159
- csv_header(row.keys.sort) if !@csv
160
-
161
- print = { }
162
- row = @csv_keys.map do |i|
163
- s = normalize(i, row[i])
164
- print[i] = s if !s.empty?
165
- s
166
- end
167
- $stderr.puts print.ai if @options[:verbose]
168
- @csv << row
169
- @csv.flush
184
+ def _banner(s)
185
+ Util.banner(s) if !@options[:quiet]
170
186
  end
171
187
  end
172
188
  end
data/lib/sinew/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = "1.0.0"
3
+ VERSION = "1.0.1"
4
4
  end
data/test/helper.rb ADDED
@@ -0,0 +1,64 @@
1
+ require "active_support/core_ext"
2
+ require "test/unit"
3
+ require "sinew"
4
+
5
+ module Sinew
6
+ class TestCase < Test::Unit::TestCase
7
+ TMP = "/tmp/_test_sinew"
8
+ HTML_FILE = File.expand_path("#{File.dirname(__FILE__)}/test.html")
9
+ HTML = File.read(HTML_FILE)
10
+
11
+ #
12
+ # for mocking curl
13
+ #
14
+
15
+ def mock_curl_200
16
+ Proc.new do |cmd, args|
17
+ mock_curl(args, HTML, "HTTP/1.1 200 OK")
18
+ end
19
+ end
20
+
21
+ def mock_curl_302
22
+ Proc.new do |cmd, args|
23
+ mock_curl(args, "", "HTTP/1.1 302 Moved Temporarily\r\nLocation: http://www.gub.com")
24
+ end
25
+ end
26
+
27
+ def mock_curl_500
28
+ Proc.new do |cmd, args|
29
+ raise Util::RunError, "curl error"
30
+ end
31
+ end
32
+
33
+ def mock_curl(args, body, head)
34
+ File.write(args[args.index("--output") + 1], body)
35
+ File.write(args[args.index("--dump-header") + 1], "#{head}\r\n\r\n")
36
+ end
37
+ end
38
+ end
39
+
40
+ #
41
+ # from MiniTest, but not in the gem yet
42
+ #
43
+
44
+ class Object
45
+ def stub name, val_or_callable, &block
46
+ new_name = "__minitest_stub__#{name}"
47
+
48
+ metaclass = class << self; self; end
49
+ metaclass.send :alias_method, new_name, name
50
+ metaclass.send :define_method, name do |*args|
51
+ if val_or_callable.respond_to? :call then
52
+ val_or_callable.call(*args)
53
+ else
54
+ val_or_callable
55
+ end
56
+ end
57
+
58
+ yield
59
+ ensure
60
+ metaclass.send :undef_method, name
61
+ metaclass.send :alias_method, name, new_name
62
+ metaclass.send :undef_method, new_name
63
+ end
64
+ end
data/test/test.html ADDED
@@ -0,0 +1,39 @@
1
+ <html>
2
+ <head>
3
+ <title>Title</title>
4
+ <script>
5
+ alert("alert 1");
6
+ alert("alert 2");
7
+ </script>
8
+ </head>
9
+
10
+ <body>
11
+ <div id="main">
12
+ <span class="class1"> text1 </span>
13
+ <span class="class2"> text2 </span>
14
+
15
+ <!-- for test_normalize -->
16
+ <div id="element">text</div>
17
+ <div class="e">text1</div>
18
+ <div class="e">text2</div>
19
+ </div>
20
+
21
+ <div id="nokogiri_ext">
22
+ <ul>
23
+ <li>hello</li>
24
+ <li>world</li>
25
+ </ul>
26
+ <div>
27
+ a
28
+ <p>b<span>c</span></p>
29
+ <p>b<span>c</span></p>
30
+ </div>
31
+ </div>
32
+
33
+ <div id="text_util">
34
+ <!-- a comment that should be removed -->
35
+ <div class="will_be_removed"/>
36
+ <a class="will_be_preserved"/>
37
+ </div>
38
+ </body>
39
+ </html>
@@ -0,0 +1,70 @@
1
+ require "helper"
2
+
3
+ module Sinew
4
+ class TestCurler < TestCase
5
+ def setup
6
+ # create TMP dir
7
+ FileUtils.rm_rf(TMP) if File.exists?(TMP)
8
+ FileUtils.mkdir_p(TMP)
9
+
10
+ # curler, pointed at TMP
11
+ @curler = Curler.new(dir: TMP, verbose: false)
12
+ end
13
+
14
+ #
15
+ # tests
16
+ #
17
+
18
+ def test_200
19
+ Util.stub(:run, mock_curl_200) do
20
+ path = @curler.get("http://www.example.com")
21
+ assert_equal(HTML, File.read(path))
22
+ end
23
+ end
24
+
25
+ def test_500
26
+ assert_raises(Curler::Error) do
27
+ Util.stub(:run, mock_curl_500) do
28
+ @curler.get("http://www.example.com")
29
+ end
30
+ end
31
+ end
32
+
33
+ def test_cached
34
+ Util.stub(:run, mock_curl_200) do
35
+ assert_equal(HTML, File.read(@curler.get("http://www.example.com")))
36
+ end
37
+ # the file is cached, so this shouldn't produce an error
38
+ Util.stub(:run, mock_curl_500) do
39
+ @curler.get("http://www.example.com")
40
+ end
41
+ end
42
+
43
+ def test_302
44
+ Util.stub(:run, mock_curl_302) do
45
+ @curler.get("http://www.example.com")
46
+ assert_equal("http://www.gub.com", @curler.url)
47
+ end
48
+ end
49
+
50
+ def test_rate_limit
51
+ slept = false
52
+
53
+ # change Kernel#sleep to not really sleep!
54
+ Kernel.send(:alias_method, :old_sleep, :sleep)
55
+ Kernel.send(:define_method, :sleep) do |x|
56
+ slept = true
57
+ end
58
+
59
+ Util.stub(:run, mock_curl_200) do
60
+ @curler.get("http://www.example.com/1")
61
+ @curler.get("http://www.example.com/2")
62
+ end
63
+ assert(slept)
64
+
65
+ # restore old Kernel#sleep
66
+ Kernel.send(:alias_method, :sleep, :old_sleep)
67
+ Kernel.send(:undef_method, :old_sleep)
68
+ end
69
+ end
70
+ end
data/test/test_main.rb ADDED
@@ -0,0 +1,101 @@
1
+ # encoding: UTF-8
2
+
3
+ require "helper"
4
+
5
+ module Sinew
6
+ class TestMain < TestCase
7
+ RECIPE = "#{TMP}/test.sinew"
8
+ CSV = "#{TMP}/test.csv"
9
+
10
+ def setup
11
+ # create TMP dir
12
+ FileUtils.rm_rf(TMP) if File.exists?(TMP)
13
+ FileUtils.mkdir_p(TMP)
14
+ end
15
+
16
+ def run_recipe(recipe)
17
+ File.write(RECIPE, recipe)
18
+ Util.stub(:run, mock_curl_200) do
19
+ Sinew::Main.new(cache: TMP, file: RECIPE, quiet: true)
20
+ end
21
+ end
22
+
23
+ def test_noko
24
+ run_recipe <<'EOF'
25
+ get "http://www.example.com"
26
+ csv_header(:class, :text)
27
+ noko.css("#main span").each do |span|
28
+ csv_emit(class: span[:class], text: span.text)
29
+ end
30
+ EOF
31
+ assert_equal("class,text\nclass1,text1\nclass2,text2\n", File.read(CSV))
32
+ end
33
+
34
+ def test_raw
35
+ # test javascript, which is only crawlable with raw
36
+ run_recipe <<'EOF'
37
+ get "http://www.example.com"
38
+ raw.scan(/alert\("([^"]+)/) do
39
+ csv_emit(alert: $1)
40
+ end
41
+ EOF
42
+ assert_equal("alert\nalert 1\nalert 2\n", File.read(CSV))
43
+ end
44
+
45
+ def test_html
46
+ # note the cleaned up whitespace
47
+ run_recipe <<'EOF'
48
+ get "http://www.example.com"
49
+ csv_header(:class, :text)
50
+ html.scan(/<span class="(\w+)">(\w+)/) do
51
+ csv_emit(class: $1, text: $2)
52
+ end
53
+ EOF
54
+ assert_equal("class,text\nclass1,text1\nclass2,text2\n", File.read(CSV))
55
+ end
56
+
57
+ def test_clean
58
+ # note the removed attributes from span
59
+ run_recipe <<'EOF'
60
+ get "http://www.example.com"
61
+ clean.scan(/<span>(text\d)/) do
62
+ csv_emit(text: $1)
63
+ end
64
+ EOF
65
+ assert_equal("text\ntext1\ntext2\n", File.read(CSV))
66
+ end
67
+
68
+ def test_normalize
69
+ s = Sinew::Main.new(test: true)
70
+
71
+ #
72
+ # non-strings
73
+ #
74
+
75
+ noko = Nokogiri::HTML(HTML).css("#main")
76
+ # node => text
77
+ assert_equal("text", s.send(:_normalize, noko.css("#element")))
78
+ # nodes => text joined with space
79
+ assert_equal("text1 text2", s.send(:_normalize, noko.css(".e")))
80
+ # array => text joined with pipe
81
+ assert_equal("1|2", s.send(:_normalize, [1,2]))
82
+
83
+ #
84
+ # string cleanups
85
+ #
86
+
87
+ # untag
88
+ assert_equal("gub", s.send(:_normalize, "<tag>gub</tag>"))
89
+ # convert_accented_entities
90
+ assert_equal("a", s.send(:_normalize, "&aacute;"))
91
+ # unent
92
+ assert_equal("<>", s.send(:_normalize, "&lt;&gt;"))
93
+ # to_ascii
94
+ assert_equal("cafe", s.send(:_normalize, "caf\xc3\xa9"))
95
+ # squish
96
+ assert_equal("hello world", s.send(:_normalize, "\nhello \t \rworld"))
97
+ end
98
+ end
99
+ end
100
+
101
+
@@ -0,0 +1,19 @@
1
+ require "helper"
2
+
3
+ module Sinew
4
+ class TestNokogiriExt < TestCase
5
+ def setup
6
+ @noko = Nokogiri::HTML(HTML).css("#nokogiri_ext")
7
+ end
8
+
9
+ def test_inner_text
10
+ assert_equal("hello world", @noko.css("li").inner_text)
11
+ assert_equal("<li>hello</li> <li>world</li>", @noko.css("ul").inner_html.squish)
12
+ end
13
+
14
+ def test_just_me
15
+ assert_equal("a", @noko.css("div").text_just_me.squish)
16
+ assert_equal("b b", @noko.css("p").text_just_me.squish)
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,23 @@
1
+ require "helper"
2
+
3
+ module Sinew
4
+ class TestTextUtil < TestCase
5
+ def test_tidy
6
+ tidy = TextUtil.html_tidy(HTML)
7
+ # tags removed?
8
+ assert(tidy !~ /script|meta/)
9
+ # squished?
10
+ assert(tidy !~ / /)
11
+ # comments removed?
12
+ assert(tidy !~ /<!--/)
13
+ end
14
+
15
+ def test_clean
16
+ clean = TextUtil.html_clean(HTML)
17
+ # attributes removed
18
+ assert(clean !~ /will_be_removed/)
19
+ # attributes preserved
20
+ assert(clean =~ /will_be_preserved/)
21
+ end
22
+ end
23
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sinew
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-06-04 00:00:00.000000000 Z
12
+ date: 2012-06-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -146,6 +146,12 @@ files:
146
146
  - lib/sinew/version.rb
147
147
  - sample.sinew
148
148
  - sinew.gemspec
149
+ - test/helper.rb
150
+ - test/test.html
151
+ - test/test_curler.rb
152
+ - test/test_main.rb
153
+ - test/test_nokogiri_ext.rb
154
+ - test/test_text_util.rb
149
155
  homepage: http://github.com/gurgeous/sinew
150
156
  licenses: []
151
157
  post_install_message:
@@ -160,7 +166,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
160
166
  version: '0'
161
167
  segments:
162
168
  - 0
163
- hash: 106543959769779396
169
+ hash: 2227650352747651089
164
170
  required_rubygems_version: !ruby/object:Gem::Requirement
165
171
  none: false
166
172
  requirements:
@@ -169,7 +175,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
169
175
  version: '0'
170
176
  segments:
171
177
  - 0
172
- hash: 106543959769779396
178
+ hash: 2227650352747651089
173
179
  requirements: []
174
180
  rubyforge_project: sinew
175
181
  rubygems_version: 1.8.21