sinew 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -1,6 +1,7 @@
1
1
  require "bundler"
2
2
  require "bundler/setup"
3
3
  require "rake"
4
+ require "rake/testtask"
4
5
 
5
6
  $LOAD_PATH << File.expand_path("../lib", __FILE__)
6
7
  require "sinew/version"
@@ -24,7 +25,15 @@ task :release => :build do
24
25
  system "gem push sinew-#{Sinew::VERSION}.gem"
25
26
  end
26
27
 
27
- task :default => :gem
28
+ #
29
+ # minitest
30
+ #
31
+
32
+ Rake::TestTask.new(:test) do |test|
33
+ test.libs << "test"
34
+ end
35
+
36
+ task :default => :test
28
37
 
29
38
  # to test:
30
39
  # block ; rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
data/bin/sinew CHANGED
@@ -3,10 +3,14 @@
3
3
  require "sinew"
4
4
  require "trollop"
5
5
 
6
+ raise "Sinew requires Ruby 1.9 or higher" if RUBY_VERSION < "1.9"
7
+
6
8
  # ARGV
7
9
  options = Trollop.options do
8
10
  banner "Usage: sinew [options] <gub.sinew>"
11
+ opt :cache, "Set the cache directory (defaults to ~/.sinew)"
9
12
  opt :verbose, "Dump every row"
13
+ opt :quiet, "Be quiet"
10
14
  end
11
15
  Trollop.die "need a .sinew file to run against" if ARGV.blank?
12
16
 
data/lib/sinew/main.rb CHANGED
@@ -13,73 +13,17 @@ module Sinew
13
13
 
14
14
  def initialize(options)
15
15
  @options = options.dup
16
- @csv = @path = nil
17
-
18
- @curler = Curler.new(user_agent: "sinew/#{VERSION}")
19
-
20
- file = @options[:file]
21
- if !File.exists?(file)
22
- Util.fatal("#{file} not found")
23
- end
24
-
25
- tm = Time.now
26
- instance_eval(File.read(file, mode: "rb"), file)
27
- if @path
28
- Util.banner("Finished #{@path} in #{(Time.now - tm).to_i}s.")
29
- else
30
- Util.banner("Finished in #{(Time.now - tm).to_i}s.")
31
- end
16
+ _run if !@options[:test]
32
17
  end
33
18
 
34
19
  def get(url, params = nil)
35
- http(url, params, :get)
20
+ _http(url, params, :get)
36
21
  end
37
22
 
38
23
  def post(url, params = nil)
39
- http(url, params, :post)
24
+ _http(url, params, :post)
40
25
  end
41
26
 
42
- def http(url, params, method)
43
- url = url.to_s
44
- raise "invalid url #{url.inspect}" if url !~ /^http/i
45
-
46
- # decode entities
47
- url = CODER.decode(url)
48
-
49
- # handle params
50
- body = nil
51
- if params
52
- q = params.map { |key, value| [CGI.escape(key.to_s), CGI.escape(value.to_s)] }.sort
53
- q = q.map { |key, value| "#{key}=#{value}" }.join("&")
54
- if method == :get
55
- separator = url.include?(??) ? "&" : "?"
56
- url = "#{url}#{separator}#{q}"
57
- else
58
- body = q
59
- end
60
- end
61
-
62
- begin
63
- if method == :get
64
- path = @curler.get(url)
65
- else
66
- path = @curler.post(url, body)
67
- end
68
- @raw = File.read(path, mode: "rb")
69
- rescue Curler::Error => e
70
- $stderr.puts "xxx #{e.message}"
71
- @raw = ""
72
- end
73
-
74
- # setup local variables
75
- @url, @uri = @curler.url, @curler.uri
76
- @html = nil
77
- @clean = nil
78
- @noko = nil
79
-
80
- nil
81
- end
82
-
83
27
  #
84
28
  # lazy accessors for cleaned up version
85
29
  #
@@ -136,10 +80,92 @@ module Sinew
136
80
  @csv = CSV.open(file, "wb")
137
81
  @csv_keys = args
138
82
  @csv << @csv_keys
139
- Util.banner("Writing to #{@path}...")
83
+ _banner("Writing to #{@path}...")
140
84
  end
141
85
 
142
- def normalize(key, s)
86
+ def csv_emit(row, options = {})
87
+ csv_header(row.keys.sort) if !@csv
88
+
89
+ print = { }
90
+ row = @csv_keys.map do |i|
91
+ s = _normalize(row[i], i)
92
+ print[i] = s if !s.empty?
93
+ s
94
+ end
95
+ $stderr.puts print.ai if @options[:verbose]
96
+ @csv << row
97
+ @csv.flush
98
+ end
99
+
100
+ protected
101
+
102
+ def _curler
103
+ @curler ||= begin
104
+ # curler
105
+ options = { user_agent: "sinew/#{VERSION}" }
106
+ options[:dir] = @options[:cache] if @options[:cache]
107
+ options[:verbose] = false if @options[:quiet]
108
+ Curler.new(options)
109
+ end
110
+ end
111
+
112
+ def _run
113
+ @csv = @path = nil
114
+
115
+ file = @options[:file]
116
+ if !File.exists?(file)
117
+ Util.fatal("#{file} not found")
118
+ end
119
+
120
+ tm = Time.now
121
+ instance_eval(File.read(file, mode: "rb"), file)
122
+ if @path
123
+ _banner("Finished #{@path} in #{(Time.now - tm).to_i}s.")
124
+ else
125
+ _banner("Finished in #{(Time.now - tm).to_i}s.")
126
+ end
127
+ end
128
+
129
+ def _http(url, params, method)
130
+ url = url.to_s
131
+ raise "invalid url #{url.inspect}" if url !~ /^http/i
132
+
133
+ # decode entities
134
+ url = CODER.decode(url)
135
+
136
+ # handle params
137
+ body = nil
138
+ if params
139
+ q = params.map { |key, value| [CGI.escape(key.to_s), CGI.escape(value.to_s)] }.sort
140
+ q = q.map { |key, value| "#{key}=#{value}" }.join("&")
141
+ if method == :get
142
+ separator = url.include?(??) ? "&" : "?"
143
+ url = "#{url}#{separator}#{q}"
144
+ else
145
+ body = q
146
+ end
147
+ end
148
+
149
+ begin
150
+ if method == :get
151
+ path = _curler.get(url)
152
+ else
153
+ path = _curler.post(url, body)
154
+ end
155
+ @raw = File.read(path, mode: "rb")
156
+ rescue Curler::Error => e
157
+ $stderr.puts "xxx #{e.message}"
158
+ @raw = ""
159
+ end
160
+
161
+ # setup local variables
162
+ @url, @uri = _curler.url, _curler.uri
163
+ @html = nil
164
+ @clean = nil
165
+ @noko = nil
166
+ end
167
+
168
+ def _normalize(s, key = nil)
143
169
  case s
144
170
  when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
145
171
  s = s.inner_html
@@ -155,18 +181,8 @@ module Sinew
155
181
  s
156
182
  end
157
183
 
158
- def csv_emit(row, options = {})
159
- csv_header(row.keys.sort) if !@csv
160
-
161
- print = { }
162
- row = @csv_keys.map do |i|
163
- s = normalize(i, row[i])
164
- print[i] = s if !s.empty?
165
- s
166
- end
167
- $stderr.puts print.ai if @options[:verbose]
168
- @csv << row
169
- @csv.flush
184
+ def _banner(s)
185
+ Util.banner(s) if !@options[:quiet]
170
186
  end
171
187
  end
172
188
  end
data/lib/sinew/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = "1.0.0"
3
+ VERSION = "1.0.1"
4
4
  end
data/test/helper.rb ADDED
@@ -0,0 +1,64 @@
1
+ require "active_support/core_ext"
2
+ require "test/unit"
3
+ require "sinew"
4
+
5
+ module Sinew
6
+ class TestCase < Test::Unit::TestCase
7
+ TMP = "/tmp/_test_sinew"
8
+ HTML_FILE = File.expand_path("#{File.dirname(__FILE__)}/test.html")
9
+ HTML = File.read(HTML_FILE)
10
+
11
+ #
12
+ # for mocking curl
13
+ #
14
+
15
+ def mock_curl_200
16
+ Proc.new do |cmd, args|
17
+ mock_curl(args, HTML, "HTTP/1.1 200 OK")
18
+ end
19
+ end
20
+
21
+ def mock_curl_302
22
+ Proc.new do |cmd, args|
23
+ mock_curl(args, "", "HTTP/1.1 302 Moved Temporarily\r\nLocation: http://www.gub.com")
24
+ end
25
+ end
26
+
27
+ def mock_curl_500
28
+ Proc.new do |cmd, args|
29
+ raise Util::RunError, "curl error"
30
+ end
31
+ end
32
+
33
+ def mock_curl(args, body, head)
34
+ File.write(args[args.index("--output") + 1], body)
35
+ File.write(args[args.index("--dump-header") + 1], "#{head}\r\n\r\n")
36
+ end
37
+ end
38
+ end
39
+
40
+ #
41
+ # from MiniTest, but not in the gem yet
42
+ #
43
+
44
+ class Object
45
+ def stub name, val_or_callable, &block
46
+ new_name = "__minitest_stub__#{name}"
47
+
48
+ metaclass = class << self; self; end
49
+ metaclass.send :alias_method, new_name, name
50
+ metaclass.send :define_method, name do |*args|
51
+ if val_or_callable.respond_to? :call then
52
+ val_or_callable.call(*args)
53
+ else
54
+ val_or_callable
55
+ end
56
+ end
57
+
58
+ yield
59
+ ensure
60
+ metaclass.send :undef_method, name
61
+ metaclass.send :alias_method, name, new_name
62
+ metaclass.send :undef_method, new_name
63
+ end
64
+ end
data/test/test.html ADDED
@@ -0,0 +1,39 @@
1
+ <html>
2
+ <head>
3
+ <title>Title</title>
4
+ <script>
5
+ alert("alert 1");
6
+ alert("alert 2");
7
+ </script>
8
+ </head>
9
+
10
+ <body>
11
+ <div id="main">
12
+ <span class="class1"> text1 </span>
13
+ <span class="class2"> text2 </span>
14
+
15
+ <!-- for test_normalize -->
16
+ <div id="element">text</div>
17
+ <div class="e">text1</div>
18
+ <div class="e">text2</div>
19
+ </div>
20
+
21
+ <div id="nokogiri_ext">
22
+ <ul>
23
+ <li>hello</li>
24
+ <li>world</li>
25
+ </ul>
26
+ <div>
27
+ a
28
+ <p>b<span>c</span></p>
29
+ <p>b<span>c</span></p>
30
+ </div>
31
+ </div>
32
+
33
+ <div id="text_util">
34
+ <!-- a comment that should be removed -->
35
+ <div class="will_be_removed"/>
36
+ <a class="will_be_preserved"/>
37
+ </div>
38
+ </body>
39
+ </html>
data/test/test_curler.rb ADDED
@@ -0,0 +1,70 @@
1
+ require "helper"
2
+
3
+ module Sinew
4
+ class TestCurler < TestCase
5
+ def setup
6
+ # create TMP dir
7
+ FileUtils.rm_rf(TMP) if File.exists?(TMP)
8
+ FileUtils.mkdir_p(TMP)
9
+
10
+ # curler, pointed at TMP
11
+ @curler = Curler.new(dir: TMP, verbose: false)
12
+ end
13
+
14
+ #
15
+ # tests
16
+ #
17
+
18
+ def test_200
19
+ Util.stub(:run, mock_curl_200) do
20
+ path = @curler.get("http://www.example.com")
21
+ assert_equal(HTML, File.read(path))
22
+ end
23
+ end
24
+
25
+ def test_500
26
+ assert_raises(Curler::Error) do
27
+ Util.stub(:run, mock_curl_500) do
28
+ @curler.get("http://www.example.com")
29
+ end
30
+ end
31
+ end
32
+
33
+ def test_cached
34
+ Util.stub(:run, mock_curl_200) do
35
+ assert_equal(HTML, File.read(@curler.get("http://www.example.com")))
36
+ end
37
+ # the file is cached, so this shouldn't produce an error
38
+ Util.stub(:run, mock_curl_500) do
39
+ @curler.get("http://www.example.com")
40
+ end
41
+ end
42
+
43
+ def test_302
44
+ Util.stub(:run, mock_curl_302) do
45
+ @curler.get("http://www.example.com")
46
+ assert_equal("http://www.gub.com", @curler.url)
47
+ end
48
+ end
49
+
50
+ def test_rate_limit
51
+ slept = false
52
+
53
+ # change Kernel#sleep to not really sleep!
54
+ Kernel.send(:alias_method, :old_sleep, :sleep)
55
+ Kernel.send(:define_method, :sleep) do |x|
56
+ slept = true
57
+ end
58
+
59
+ Util.stub(:run, mock_curl_200) do
60
+ @curler.get("http://www.example.com/1")
61
+ @curler.get("http://www.example.com/2")
62
+ end
63
+ assert(slept)
64
+
65
+ # restore old Kernel#sleep
66
+ Kernel.send(:alias_method, :sleep, :old_sleep)
67
+ Kernel.send(:undef_method, :old_sleep)
68
+ end
69
+ end
70
+ end
data/test/test_main.rb ADDED
@@ -0,0 +1,101 @@
1
+ # encoding: UTF-8
2
+
3
+ require "helper"
4
+
5
+ module Sinew
6
+ class TestMain < TestCase
7
+ RECIPE = "#{TMP}/test.sinew"
8
+ CSV = "#{TMP}/test.csv"
9
+
10
+ def setup
11
+ # create TMP dir
12
+ FileUtils.rm_rf(TMP) if File.exists?(TMP)
13
+ FileUtils.mkdir_p(TMP)
14
+ end
15
+
16
+ def run_recipe(recipe)
17
+ File.write(RECIPE, recipe)
18
+ Util.stub(:run, mock_curl_200) do
19
+ Sinew::Main.new(cache: TMP, file: RECIPE, quiet: true)
20
+ end
21
+ end
22
+
23
+ def test_noko
24
+ run_recipe <<'EOF'
25
+ get "http://www.example.com"
26
+ csv_header(:class, :text)
27
+ noko.css("#main span").each do |span|
28
+ csv_emit(class: span[:class], text: span.text)
29
+ end
30
+ EOF
31
+ assert_equal("class,text\nclass1,text1\nclass2,text2\n", File.read(CSV))
32
+ end
33
+
34
+ def test_raw
35
+ # test javascript, which is only crawlable with raw
36
+ run_recipe <<'EOF'
37
+ get "http://www.example.com"
38
+ raw.scan(/alert\("([^"]+)/) do
39
+ csv_emit(alert: $1)
40
+ end
41
+ EOF
42
+ assert_equal("alert\nalert 1\nalert 2\n", File.read(CSV))
43
+ end
44
+
45
+ def test_html
46
+ # note the cleaned up whitespace
47
+ run_recipe <<'EOF'
48
+ get "http://www.example.com"
49
+ csv_header(:class, :text)
50
+ html.scan(/<span class="(\w+)">(\w+)/) do
51
+ csv_emit(class: $1, text: $2)
52
+ end
53
+ EOF
54
+ assert_equal("class,text\nclass1,text1\nclass2,text2\n", File.read(CSV))
55
+ end
56
+
57
+ def test_clean
58
+ # note the removed attributes from span
59
+ run_recipe <<'EOF'
60
+ get "http://www.example.com"
61
+ clean.scan(/<span>(text\d)/) do
62
+ csv_emit(text: $1)
63
+ end
64
+ EOF
65
+ assert_equal("text\ntext1\ntext2\n", File.read(CSV))
66
+ end
67
+
68
+ def test_normalize
69
+ s = Sinew::Main.new(test: true)
70
+
71
+ #
72
+ # non-strings
73
+ #
74
+
75
+ noko = Nokogiri::HTML(HTML).css("#main")
76
+ # node => text
77
+ assert_equal("text", s.send(:_normalize, noko.css("#element")))
78
+ # nodes => text joined with space
79
+ assert_equal("text1 text2", s.send(:_normalize, noko.css(".e")))
80
+ # array => text joined with pipe
81
+ assert_equal("1|2", s.send(:_normalize, [1,2]))
82
+
83
+ #
84
+ # string cleanups
85
+ #
86
+
87
+ # untag
88
+ assert_equal("gub", s.send(:_normalize, "<tag>gub</tag>"))
89
+ # convert_accented_entities
90
+ assert_equal("a", s.send(:_normalize, "&aacute;"))
91
+ # unent
92
+ assert_equal("<>", s.send(:_normalize, "&lt;&gt;"))
93
+ # to_ascii
94
+ assert_equal("cafe", s.send(:_normalize, "caf\xc3\xa9"))
95
+ # squish
96
+ assert_equal("hello world", s.send(:_normalize, "\nhello \t \rworld"))
97
+ end
98
+ end
99
+ end
100
+
101
+
data/test/test_nokogiri_ext.rb ADDED
@@ -0,0 +1,19 @@
1
+ require "helper"
2
+
3
+ module Sinew
4
+ class TestNokogiriExt < TestCase
5
+ def setup
6
+ @noko = Nokogiri::HTML(HTML).css("#nokogiri_ext")
7
+ end
8
+
9
+ def test_inner_text
10
+ assert_equal("hello world", @noko.css("li").inner_text)
11
+ assert_equal("<li>hello</li> <li>world</li>", @noko.css("ul").inner_html.squish)
12
+ end
13
+
14
+ def test_just_me
15
+ assert_equal("a", @noko.css("div").text_just_me.squish)
16
+ assert_equal("b b", @noko.css("p").text_just_me.squish)
17
+ end
18
+ end
19
+ end
data/test/test_text_util.rb ADDED
@@ -0,0 +1,23 @@
1
+ require "helper"
2
+
3
+ module Sinew
4
+ class TestTextUtil < TestCase
5
+ def test_tidy
6
+ tidy = TextUtil.html_tidy(HTML)
7
+ # tags removed?
8
+ assert(tidy !~ /script|meta/)
9
+ # squished?
10
+ assert(tidy !~ /  /)
11
+ # comments removed?
12
+ assert(tidy !~ /<!--/)
13
+ end
14
+
15
+ def test_clean
16
+ clean = TextUtil.html_clean(HTML)
17
+ # attributes removed
18
+ assert(clean !~ /will_be_removed/)
19
+ # attributes preserved
20
+ assert(clean =~ /will_be_preserved/)
21
+ end
22
+ end
23
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sinew
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-06-04 00:00:00.000000000 Z
12
+ date: 2012-06-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -146,6 +146,12 @@ files:
146
146
  - lib/sinew/version.rb
147
147
  - sample.sinew
148
148
  - sinew.gemspec
149
+ - test/helper.rb
150
+ - test/test.html
151
+ - test/test_curler.rb
152
+ - test/test_main.rb
153
+ - test/test_nokogiri_ext.rb
154
+ - test/test_text_util.rb
149
155
  homepage: http://github.com/gurgeous/sinew
150
156
  licenses: []
151
157
  post_install_message:
@@ -160,7 +166,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
160
166
  version: '0'
161
167
  segments:
162
168
  - 0
163
- hash: 106543959769779396
169
+ hash: 2227650352747651089
164
170
  required_rubygems_version: !ruby/object:Gem::Requirement
165
171
  none: false
166
172
  requirements:
@@ -169,7 +175,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
169
175
  version: '0'
170
176
  segments:
171
177
  - 0
172
- hash: 106543959769779396
178
+ hash: 2227650352747651089
173
179
  requirements: []
174
180
  rubyforge_project: sinew
175
181
  rubygems_version: 1.8.21