sinew 1.0.3 → 1.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +15 -9
- data/lib/sinew/main.rb +5 -5
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +4 -5
- data/sinew.gemspec +3 -3
- metadata +32 -48
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c99e34e066e5c74889a842032d90dbe867e18ac6
|
4
|
+
data.tar.gz: 4cae139e85a63aecb0a5671a43fc8881219236df
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c341f22f607ff0ac03c3f2646d168b6776a21ab9f8b23875a99e9fae71d6cf972a6b22255ec176a3fad68b6d65d26f8cd6d2f34bdfb8a7b6d949bdb61deea6d2
|
7
|
+
data.tar.gz: 8314c4ab14180443050ddc74b828b5833610a86d93fb1e29e3bc2b320e0e6a12cf6b86b96c6a02bc9104f064fd242c2df7df11faf4d2237fa7756bb1865791c5
|
data/README.md
CHANGED
@@ -12,20 +12,18 @@ gem install sinew
|
|
12
12
|
|
13
13
|
## Example
|
14
14
|
|
15
|
-
Here's an example for collecting
|
15
|
+
Here's an example for collecting the links from httpbin.org:
|
16
16
|
|
17
17
|
```ruby
|
18
18
|
# get the url
|
19
|
-
get "http://
|
19
|
+
get "http://httpbin.org"
|
20
20
|
|
21
|
-
# use nokogiri to
|
22
|
-
noko.css("
|
23
|
-
# pull out the stuff we care about using nokogiri
|
21
|
+
# use nokogiri to collect links
|
22
|
+
noko.css("ul li a").each do |a|
|
24
23
|
row = { }
|
25
|
-
row[:url] =
|
26
|
-
row[:title] =
|
27
|
-
|
28
|
-
|
24
|
+
row[:url] = a[:href]
|
25
|
+
row[:title] = a.text
|
26
|
+
|
29
27
|
# append a row to the csv
|
30
28
|
csv_emit(row)
|
31
29
|
end
|
@@ -33,6 +31,14 @@ end
|
|
33
31
|
|
34
32
|
If you paste this into a file called `bestsellers.sinew` and run `sinew bestsellers.sinew`, it will create a `bestsellers.csv` file containing the url, title and img for each bestseller.
|
35
33
|
|
34
|
+
## How does Sinew differ from Mechanize?
|
35
|
+
|
36
|
+
I'm not an expert on Mechanize, but this question has come up repeatedly and I'll try to address it. Mechanize is a great toolkit and it's better for some situations. Briefly:
|
37
|
+
|
38
|
+
* Sinew caches all HTTP requests on disk. That makes it possible to iterate quickly. Crawl once and then continue to work on your recipe. Run the recipe over and over while you tune your CSS selectors and regular expressions.
|
39
|
+
* Sinew runs responses through [HTML Tidy](http://tidy.sourceforge.net). This cleans up dirty HTML and makes it easier to parse in many cases, especially if you have to fall back to regular expressions instead of Nokogiri. Unfortunately, this is a common use case in my experience.
|
40
|
+
* Sinew outputs CSV files. It does exactly one thing and it does it well - Sinew crawls a site and outputs a CSV file. Mechanize is a more general toolkit.
|
41
|
+
|
36
42
|
## Full Documentation
|
37
43
|
|
38
44
|
Full docs are in the wiki:
|
data/lib/sinew/main.rb
CHANGED
@@ -19,7 +19,7 @@ module Sinew
|
|
19
19
|
def get(url, params = nil)
|
20
20
|
_http(url, params, :get)
|
21
21
|
end
|
22
|
-
|
22
|
+
|
23
23
|
def post(url, params = nil)
|
24
24
|
_http(url, params, :post)
|
25
25
|
end
|
@@ -93,7 +93,7 @@ module Sinew
|
|
93
93
|
s
|
94
94
|
end
|
95
95
|
$stderr.puts print.ai if @options[:verbose]
|
96
|
-
@csv << row
|
96
|
+
@csv << row
|
97
97
|
@csv.flush
|
98
98
|
end
|
99
99
|
|
@@ -111,7 +111,7 @@ module Sinew
|
|
111
111
|
|
112
112
|
def _run
|
113
113
|
@csv = @path = nil
|
114
|
-
|
114
|
+
|
115
115
|
file = @options[:file]
|
116
116
|
if !File.exists?(file)
|
117
117
|
Util.fatal("#{file} not found")
|
@@ -164,7 +164,7 @@ module Sinew
|
|
164
164
|
@clean = nil
|
165
165
|
@noko = nil
|
166
166
|
end
|
167
|
-
|
167
|
+
|
168
168
|
def _normalize(s, key = nil)
|
169
169
|
case s
|
170
170
|
when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
|
@@ -175,7 +175,7 @@ module Sinew
|
|
175
175
|
s = s.to_s
|
176
176
|
end
|
177
177
|
s = TextUtil.untag(s)
|
178
|
-
s = s.
|
178
|
+
s = s.convert_accented_html_entities
|
179
179
|
s = TextUtil.unent(s)
|
180
180
|
s = s.to_ascii.squish
|
181
181
|
s
|
data/lib/sinew/version.rb
CHANGED
data/sample.sinew
CHANGED
@@ -1,8 +1,7 @@
|
|
1
|
-
get "http://
|
2
|
-
noko.css("
|
1
|
+
get "http://httpbin.org"
|
2
|
+
noko.css("ul li a").each do |a|
|
3
3
|
row = { }
|
4
|
-
row[:url] =
|
5
|
-
row[:title] =
|
6
|
-
row[:img] = item.css(".zg_itemImage_normal img").first[:src]
|
4
|
+
row[:url] = a[:href]
|
5
|
+
row[:title] = a.text
|
7
6
|
csv_emit(row)
|
8
7
|
end
|
data/sinew.gemspec
CHANGED
@@ -14,14 +14,14 @@ Gem::Specification.new do |s|
|
|
14
14
|
|
15
15
|
s.rubyforge_project = "sinew"
|
16
16
|
|
17
|
-
s.add_runtime_dependency "activesupport"
|
17
|
+
s.add_runtime_dependency "activesupport", "~> 3.0"
|
18
18
|
s.add_runtime_dependency "awesome_print"
|
19
19
|
s.add_runtime_dependency "htmlentities"
|
20
20
|
s.add_runtime_dependency "nokogiri"
|
21
|
-
s.add_runtime_dependency "stringex"
|
21
|
+
s.add_runtime_dependency "stringex", "~> 2.0"
|
22
22
|
s.add_runtime_dependency "trollop"
|
23
23
|
s.add_development_dependency "rake"
|
24
|
-
|
24
|
+
|
25
25
|
s.files = `git ls-files`.split("\n")
|
26
26
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
27
27
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
metadata
CHANGED
@@ -1,126 +1,111 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sinew
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
5
|
-
prerelease:
|
4
|
+
version: 1.0.4
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Adam Doppelt
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2013-11-10 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: activesupport
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - ~>
|
20
18
|
- !ruby/object:Gem::Version
|
21
|
-
version: '0'
|
19
|
+
version: '3.0'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- -
|
24
|
+
- - ~>
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version: '0'
|
26
|
+
version: '3.0'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: awesome_print
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- -
|
31
|
+
- - '>='
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '0'
|
38
34
|
type: :runtime
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- -
|
38
|
+
- - '>='
|
44
39
|
- !ruby/object:Gem::Version
|
45
40
|
version: '0'
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: htmlentities
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
|
-
- -
|
45
|
+
- - '>='
|
52
46
|
- !ruby/object:Gem::Version
|
53
47
|
version: '0'
|
54
48
|
type: :runtime
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
|
-
- -
|
52
|
+
- - '>='
|
60
53
|
- !ruby/object:Gem::Version
|
61
54
|
version: '0'
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: nokogiri
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
|
-
- -
|
59
|
+
- - '>='
|
68
60
|
- !ruby/object:Gem::Version
|
69
61
|
version: '0'
|
70
62
|
type: :runtime
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
|
-
- -
|
66
|
+
- - '>='
|
76
67
|
- !ruby/object:Gem::Version
|
77
68
|
version: '0'
|
78
69
|
- !ruby/object:Gem::Dependency
|
79
70
|
name: stringex
|
80
71
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
72
|
requirements:
|
83
|
-
- -
|
73
|
+
- - ~>
|
84
74
|
- !ruby/object:Gem::Version
|
85
|
-
version: '0'
|
75
|
+
version: '2.0'
|
86
76
|
type: :runtime
|
87
77
|
prerelease: false
|
88
78
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
79
|
requirements:
|
91
|
-
- -
|
80
|
+
- - ~>
|
92
81
|
- !ruby/object:Gem::Version
|
93
|
-
version: '0'
|
82
|
+
version: '2.0'
|
94
83
|
- !ruby/object:Gem::Dependency
|
95
84
|
name: trollop
|
96
85
|
requirement: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
86
|
requirements:
|
99
|
-
- -
|
87
|
+
- - '>='
|
100
88
|
- !ruby/object:Gem::Version
|
101
89
|
version: '0'
|
102
90
|
type: :runtime
|
103
91
|
prerelease: false
|
104
92
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
93
|
requirements:
|
107
|
-
- -
|
94
|
+
- - '>='
|
108
95
|
- !ruby/object:Gem::Version
|
109
96
|
version: '0'
|
110
97
|
- !ruby/object:Gem::Dependency
|
111
98
|
name: rake
|
112
99
|
requirement: !ruby/object:Gem::Requirement
|
113
|
-
none: false
|
114
100
|
requirements:
|
115
|
-
- -
|
101
|
+
- - '>='
|
116
102
|
- !ruby/object:Gem::Version
|
117
103
|
version: '0'
|
118
104
|
type: :development
|
119
105
|
prerelease: false
|
120
106
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
107
|
requirements:
|
123
|
-
- -
|
108
|
+
- - '>='
|
124
109
|
- !ruby/object:Gem::Version
|
125
110
|
version: '0'
|
126
111
|
description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
|
@@ -154,32 +139,31 @@ files:
|
|
154
139
|
- test/test_text_util.rb
|
155
140
|
homepage: http://github.com/gurgeous/sinew
|
156
141
|
licenses: []
|
142
|
+
metadata: {}
|
157
143
|
post_install_message:
|
158
144
|
rdoc_options: []
|
159
145
|
require_paths:
|
160
146
|
- lib
|
161
147
|
required_ruby_version: !ruby/object:Gem::Requirement
|
162
|
-
none: false
|
163
148
|
requirements:
|
164
|
-
- -
|
149
|
+
- - '>='
|
165
150
|
- !ruby/object:Gem::Version
|
166
151
|
version: '0'
|
167
|
-
segments:
|
168
|
-
- 0
|
169
|
-
hash: -1578176275991755695
|
170
152
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
171
|
-
none: false
|
172
153
|
requirements:
|
173
|
-
- -
|
154
|
+
- - '>='
|
174
155
|
- !ruby/object:Gem::Version
|
175
156
|
version: '0'
|
176
|
-
segments:
|
177
|
-
- 0
|
178
|
-
hash: -1578176275991755695
|
179
157
|
requirements: []
|
180
158
|
rubyforge_project: sinew
|
181
|
-
rubygems_version:
|
159
|
+
rubygems_version: 2.0.6
|
182
160
|
signing_key:
|
183
|
-
specification_version:
|
161
|
+
specification_version: 4
|
184
162
|
summary: Sinew - structured web crawling using recipes.
|
185
|
-
test_files:
|
163
|
+
test_files:
|
164
|
+
- test/helper.rb
|
165
|
+
- test/test.html
|
166
|
+
- test/test_curler.rb
|
167
|
+
- test/test_main.rb
|
168
|
+
- test/test_nokogiri_ext.rb
|
169
|
+
- test/test_text_util.rb
|