sinew 1.0.3 → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c99e34e066e5c74889a842032d90dbe867e18ac6
4
+ data.tar.gz: 4cae139e85a63aecb0a5671a43fc8881219236df
5
+ SHA512:
6
+ metadata.gz: c341f22f607ff0ac03c3f2646d168b6776a21ab9f8b23875a99e9fae71d6cf972a6b22255ec176a3fad68b6d65d26f8cd6d2f34bdfb8a7b6d949bdb61deea6d2
7
+ data.tar.gz: 8314c4ab14180443050ddc74b828b5833610a86d93fb1e29e3bc2b320e0e6a12cf6b86b96c6a02bc9104f064fd242c2df7df11faf4d2237fa7756bb1865791c5
data/README.md CHANGED
@@ -12,20 +12,18 @@ gem install sinew
12
12
 
13
13
  ## Example
14
14
 
15
- Here's an example for collecting Amazon's bestseller list:
15
+ Here's an example for collecting the links from httpbin.org:
16
16
 
17
17
  ```ruby
18
18
  # get the url
19
- get "http://www.amazon.com/gp/bestsellers/books/ref=sv_b_3"
19
+ get "http://httpbin.org"
20
20
 
21
- # use nokogiri to find books
22
- noko.css(".zg_itemRow").each do |item|
23
- # pull out the stuff we care about using nokogiri
21
+ # use nokogiri to collect links
22
+ noko.css("ul li a").each do |a|
24
23
  row = { }
25
- row[:url] = item.css(".zg_title a").first[:href]
26
- row[:title] = item.css(".zg_title")
27
- row[:img] = item.css(".zg_itemImage_normal img").first[:src]
28
-
24
+ row[:url] = a[:href]
25
+ row[:title] = a.text
26
+
29
27
  # append a row to the csv
30
28
  csv_emit(row)
31
29
  end
@@ -33,6 +31,14 @@ end
33
31
 
34
32
  If you paste this into a file called `bestsellers.sinew` and run `sinew bestsellers.sinew`, it will create a `bestsellers.csv` file containing the url and title for each link.
35
33
 
34
+ ## How does Sinew differ from Mechanize?
35
+
36
+ I'm not an expert on Mechanize, but this question has come up repeatedly and I'll try to address it. Mechanize is a great toolkit and it's better for some situations. Briefly:
37
+
38
+ * Sinew caches all HTTP requests on disk. That makes it possible to iterate quickly. Crawl once and then continue to work on your recipe. Run the recipe over and over while you tune your CSS selectors and regular expressions.
39
+ * Sinew runs responses through [HTML Tidy](http://tidy.sourceforge.net). This cleans up dirty HTML and makes it easier to parse in many cases, especially if you have to fall back to regular expressions instead of Nokogiri. Unfortunately, this is a common use case in my experience.
40
+ * Sinew outputs CSV files. It does exactly one thing and it does it well - Sinew crawls a site and outputs a CSV file. Mechanize is a more general toolkit.
41
+
36
42
  ## Full Documentation
37
43
 
38
44
  Full docs are in the wiki:
@@ -19,7 +19,7 @@ module Sinew
19
19
  def get(url, params = nil)
20
20
  _http(url, params, :get)
21
21
  end
22
-
22
+
23
23
  def post(url, params = nil)
24
24
  _http(url, params, :post)
25
25
  end
@@ -93,7 +93,7 @@ module Sinew
93
93
  s
94
94
  end
95
95
  $stderr.puts print.ai if @options[:verbose]
96
- @csv << row
96
+ @csv << row
97
97
  @csv.flush
98
98
  end
99
99
 
@@ -111,7 +111,7 @@ module Sinew
111
111
 
112
112
  def _run
113
113
  @csv = @path = nil
114
-
114
+
115
115
  file = @options[:file]
116
116
  if !File.exists?(file)
117
117
  Util.fatal("#{file} not found")
@@ -164,7 +164,7 @@ module Sinew
164
164
  @clean = nil
165
165
  @noko = nil
166
166
  end
167
-
167
+
168
168
  def _normalize(s, key = nil)
169
169
  case s
170
170
  when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
@@ -175,7 +175,7 @@ module Sinew
175
175
  s = s.to_s
176
176
  end
177
177
  s = TextUtil.untag(s)
178
- s = s.convert_accented_entities
178
+ s = s.convert_accented_html_entities
179
179
  s = TextUtil.unent(s)
180
180
  s = s.to_ascii.squish
181
181
  s
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = "1.0.3"
3
+ VERSION = "1.0.4"
4
4
  end
@@ -1,8 +1,7 @@
1
- get "http://www.amazon.com/gp/bestsellers/books/ref=sv_b_3"
2
- noko.css(".zg_itemRow").each do |item|
1
+ get "http://httpbin.org"
2
+ noko.css("ul li a").each do |a|
3
3
  row = { }
4
- row[:url] = item.css(".zg_title a").first[:href]
5
- row[:title] = item.css(".zg_title")
6
- row[:img] = item.css(".zg_itemImage_normal img").first[:src]
4
+ row[:url] = a[:href]
5
+ row[:title] = a.text
7
6
  csv_emit(row)
8
7
  end
@@ -14,14 +14,14 @@ Gem::Specification.new do |s|
14
14
 
15
15
  s.rubyforge_project = "sinew"
16
16
 
17
- s.add_runtime_dependency "activesupport"
17
+ s.add_runtime_dependency "activesupport", "~> 3.0"
18
18
  s.add_runtime_dependency "awesome_print"
19
19
  s.add_runtime_dependency "htmlentities"
20
20
  s.add_runtime_dependency "nokogiri"
21
- s.add_runtime_dependency "stringex"
21
+ s.add_runtime_dependency "stringex", "~> 2.0"
22
22
  s.add_runtime_dependency "trollop"
23
23
  s.add_development_dependency "rake"
24
-
24
+
25
25
  s.files = `git ls-files`.split("\n")
26
26
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
27
27
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
metadata CHANGED
@@ -1,126 +1,111 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sinew
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
5
- prerelease:
4
+ version: 1.0.4
6
5
  platform: ruby
7
6
  authors:
8
7
  - Adam Doppelt
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-06-19 00:00:00.000000000 Z
11
+ date: 2013-11-10 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: activesupport
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ! '>='
17
+ - - ~>
20
18
  - !ruby/object:Gem::Version
21
- version: '0'
19
+ version: '3.0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ! '>='
24
+ - - ~>
28
25
  - !ruby/object:Gem::Version
29
- version: '0'
26
+ version: '3.0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: awesome_print
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - '>='
36
32
  - !ruby/object:Gem::Version
37
33
  version: '0'
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - '>='
44
39
  - !ruby/object:Gem::Version
45
40
  version: '0'
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: htmlentities
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
- - - ! '>='
45
+ - - '>='
52
46
  - !ruby/object:Gem::Version
53
47
  version: '0'
54
48
  type: :runtime
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
- - - ! '>='
52
+ - - '>='
60
53
  - !ruby/object:Gem::Version
61
54
  version: '0'
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: nokogiri
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
- - - ! '>='
59
+ - - '>='
68
60
  - !ruby/object:Gem::Version
69
61
  version: '0'
70
62
  type: :runtime
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
- - - ! '>='
66
+ - - '>='
76
67
  - !ruby/object:Gem::Version
77
68
  version: '0'
78
69
  - !ruby/object:Gem::Dependency
79
70
  name: stringex
80
71
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
72
  requirements:
83
- - - ! '>='
73
+ - - ~>
84
74
  - !ruby/object:Gem::Version
85
- version: '0'
75
+ version: '2.0'
86
76
  type: :runtime
87
77
  prerelease: false
88
78
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
79
  requirements:
91
- - - ! '>='
80
+ - - ~>
92
81
  - !ruby/object:Gem::Version
93
- version: '0'
82
+ version: '2.0'
94
83
  - !ruby/object:Gem::Dependency
95
84
  name: trollop
96
85
  requirement: !ruby/object:Gem::Requirement
97
- none: false
98
86
  requirements:
99
- - - ! '>='
87
+ - - '>='
100
88
  - !ruby/object:Gem::Version
101
89
  version: '0'
102
90
  type: :runtime
103
91
  prerelease: false
104
92
  version_requirements: !ruby/object:Gem::Requirement
105
- none: false
106
93
  requirements:
107
- - - ! '>='
94
+ - - '>='
108
95
  - !ruby/object:Gem::Version
109
96
  version: '0'
110
97
  - !ruby/object:Gem::Dependency
111
98
  name: rake
112
99
  requirement: !ruby/object:Gem::Requirement
113
- none: false
114
100
  requirements:
115
- - - ! '>='
101
+ - - '>='
116
102
  - !ruby/object:Gem::Version
117
103
  version: '0'
118
104
  type: :development
119
105
  prerelease: false
120
106
  version_requirements: !ruby/object:Gem::Requirement
121
- none: false
122
107
  requirements:
123
- - - ! '>='
108
+ - - '>='
124
109
  - !ruby/object:Gem::Version
125
110
  version: '0'
126
111
  description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
@@ -154,32 +139,31 @@ files:
154
139
  - test/test_text_util.rb
155
140
  homepage: http://github.com/gurgeous/sinew
156
141
  licenses: []
142
+ metadata: {}
157
143
  post_install_message:
158
144
  rdoc_options: []
159
145
  require_paths:
160
146
  - lib
161
147
  required_ruby_version: !ruby/object:Gem::Requirement
162
- none: false
163
148
  requirements:
164
- - - ! '>='
149
+ - - '>='
165
150
  - !ruby/object:Gem::Version
166
151
  version: '0'
167
- segments:
168
- - 0
169
- hash: -1578176275991755695
170
152
  required_rubygems_version: !ruby/object:Gem::Requirement
171
- none: false
172
153
  requirements:
173
- - - ! '>='
154
+ - - '>='
174
155
  - !ruby/object:Gem::Version
175
156
  version: '0'
176
- segments:
177
- - 0
178
- hash: -1578176275991755695
179
157
  requirements: []
180
158
  rubyforge_project: sinew
181
- rubygems_version: 1.8.21
159
+ rubygems_version: 2.0.6
182
160
  signing_key:
183
- specification_version: 3
161
+ specification_version: 4
184
162
  summary: Sinew - structured web crawling using recipes.
185
- test_files: []
163
+ test_files:
164
+ - test/helper.rb
165
+ - test/test.html
166
+ - test/test_curler.rb
167
+ - test/test_main.rb
168
+ - test/test_nokogiri_ext.rb
169
+ - test/test_text_util.rb