sinew 2.0.3 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +26 -0
- data/.gitignore +3 -5
- data/.rubocop.yml +31 -46
- data/Gemfile +9 -0
- data/Gemfile.lock +124 -0
- data/README.md +146 -81
- data/Rakefile +36 -20
- data/bin/sinew +13 -39
- data/lib/sinew.rb +23 -10
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +251 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +45 -98
- data/lib/sinew/middleware/log_formatter.rb +23 -0
- data/lib/sinew/nokogiri_ext.rb +12 -21
- data/lib/sinew/response.rb +39 -99
- data/lib/sinew/version.rb +1 -1
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +26 -25
- metadata +46 -108
- data/.travis.yml +0 -4
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -15
- data/lib/sinew/cache.rb +0 -79
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -114
- data/lib/sinew/output.rb +0 -149
- data/lib/sinew/request.rb +0 -151
- data/lib/sinew/runtime_options.rb +0 -28
- data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
- data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
- data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
- data/test/legacy/eu.httpbin.org/status,500 +0 -1
- data/test/legacy/legacy.sinew +0 -2
- data/test/recipes/array_header.sinew +0 -6
- data/test/recipes/basic.sinew +0 -8
- data/test/recipes/dups.sinew +0 -7
- data/test/recipes/implicit_header.sinew +0 -5
- data/test/recipes/limit.sinew +0 -11
- data/test/recipes/noko.sinew +0 -9
- data/test/recipes/uri.sinew +0 -11
- data/test/recipes/xml.sinew +0 -8
- data/test/test.html +0 -45
- data/test/test_cache.rb +0 -69
- data/test/test_helper.rb +0 -123
- data/test/test_legacy.rb +0 -23
- data/test/test_main.rb +0 -34
- data/test/test_nokogiri_ext.rb +0 -18
- data/test/test_output.rb +0 -56
- data/test/test_recipes.rb +0 -60
- data/test/test_requests.rb +0 -135
- data/test/test_utf8.rb +0 -39
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c16f880ce1bf6454b10c34dd1f071daf1a758eebb52e598262d6357cebb2e9f2
|
4
|
+
data.tar.gz: 4e57acae70775805a96fd5e6bd7ed00ebd5d74dbc0e6daa3348fd9161118d00d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 99bf3da2db47a04dbd6f18dfb7aa3f2bc5f706bbe460633f3f8e3589c85377ae14b35d43c750d384fbdeac0883247b5dbab696127700c6fee71731818df57e74
|
7
|
+
data.tar.gz: 1ecda4412fc9f2384bf01aa38ed40327ac5855c37eb3c82ce65d21779a8a582880c6afaffa5d68954759ddccaa04cec9a6f36f8460c96849722ecafe4ac2ba6e
|
@@ -0,0 +1,26 @@
|
|
1
|
+
name: test
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
paths-ignore:
|
6
|
+
- '**.md'
|
7
|
+
pull_request:
|
8
|
+
paths-ignore:
|
9
|
+
- '**.md'
|
10
|
+
workflow_dispatch:
|
11
|
+
|
12
|
+
jobs:
|
13
|
+
test:
|
14
|
+
strategy:
|
15
|
+
max-parallel: 3
|
16
|
+
matrix:
|
17
|
+
os: [ubuntu, macos]
|
18
|
+
ruby-version: [3.0, 2.7]
|
19
|
+
runs-on: ${{ matrix.os }}-latest
|
20
|
+
steps:
|
21
|
+
- uses: actions/checkout@v2
|
22
|
+
- uses: ruby/setup-ruby@v1
|
23
|
+
with:
|
24
|
+
ruby-version: ${{ matrix.ruby-version }}
|
25
|
+
- run: bundle install
|
26
|
+
- run: bundle exec rake test
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
@@ -1,49 +1,34 @@
|
|
1
1
|
AllCops:
|
2
|
-
|
3
|
-
|
2
|
+
NewCops: enable
|
3
|
+
SuggestExtensions: false
|
4
|
+
TargetRubyVersion: 2.7
|
4
5
|
|
5
|
-
#
|
6
|
-
|
7
|
-
EnforcedStyle: compact
|
8
|
-
Layout/CaseIndentation:
|
9
|
-
EnforcedStyle: end
|
10
|
-
Layout/EndAlignment:
|
11
|
-
EnforcedStyleAlignWith: variable
|
12
|
-
Style/CollectionMethods:
|
13
|
-
Enabled: true
|
14
|
-
PreferredMethods:
|
15
|
-
reduce: inject
|
16
|
-
Style/EmptyMethod:
|
17
|
-
Enabled: false
|
18
|
-
Style/TrailingCommaInArrayLiteral:
|
19
|
-
EnforcedStyleForMultiline: consistent_comma
|
20
|
-
Style/TrailingCommaInHashLiteral:
|
21
|
-
EnforcedStyleForMultiline: consistent_comma
|
6
|
+
# this is buggy in 2.7.0
|
7
|
+
Style/HashTransformValues: { Enabled: false }
|
22
8
|
|
23
|
-
#
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
Style/
|
33
|
-
Style/
|
34
|
-
Style/
|
35
|
-
Style/
|
36
|
-
Style/
|
37
|
-
Style/
|
38
|
-
Style/
|
39
|
-
Style/
|
40
|
-
Style/
|
41
|
-
Style/
|
42
|
-
Style/
|
43
|
-
Style/
|
44
|
-
Style/
|
45
|
-
Style/StderrPuts: { Enabled: false }
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
Enabled: false
|
9
|
+
# minimal personal preference
|
10
|
+
Layout/CaseIndentation: { Enabled: false }
|
11
|
+
Layout/EndAlignment: { EnforcedStyleAlignWith: variable }
|
12
|
+
Lint/AssignmentInCondition: { Enabled: false }
|
13
|
+
Lint/NonLocalExitFromIterator: { Enabled: false }
|
14
|
+
Metrics: { Enabled: false }
|
15
|
+
Naming/HeredocDelimiterNaming: { Enabled: false }
|
16
|
+
Naming/MethodParameterName: { Enabled: false }
|
17
|
+
Naming/VariableNumber: { Enabled: false }
|
18
|
+
Style/AsciiComments: { Enabled: false }
|
19
|
+
Style/ClassVars: { Enabled: false }
|
20
|
+
Style/CommentAnnotation: { Enabled: false }
|
21
|
+
Style/Documentation: { Enabled: false }
|
22
|
+
Style/DoubleNegation: { Enabled: false }
|
23
|
+
Style/EmptyCaseCondition: { Enabled: false }
|
24
|
+
Style/FormatStringToken: { Enabled: false }
|
25
|
+
Style/FrozenStringLiteralComment: { Enabled: false }
|
26
|
+
Style/GuardClause: { Enabled: false }
|
27
|
+
Style/IfUnlessModifier: { Enabled: false }
|
28
|
+
Style/NegatedIf: { Enabled: false }
|
29
|
+
Style/NumericPredicate: { Enabled: false }
|
30
|
+
Style/ParallelAssignment: { Enabled: false }
|
31
|
+
Style/StderrPuts: { Enabled: false }
|
32
|
+
Style/StringConcatenation: { Enabled: false }
|
33
|
+
Style/TrailingCommaInArrayLiteral: { EnforcedStyleForMultiline: consistent_comma }
|
34
|
+
Style/TrailingCommaInHashLiteral: { EnforcedStyleForMultiline: consistent_comma }
|
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
sinew (4.0.0)
|
5
|
+
amazing_print (~> 1.3)
|
6
|
+
faraday (~> 1.4)
|
7
|
+
faraday-encoding (~> 0)
|
8
|
+
faraday-rate_limiter (~> 0.0)
|
9
|
+
hashie (~> 4.1)
|
10
|
+
httpdisk (~> 0.5)
|
11
|
+
nokogiri (~> 1.11)
|
12
|
+
slop (~> 4.8)
|
13
|
+
sterile (~> 1.0)
|
14
|
+
|
15
|
+
GEM
|
16
|
+
remote: http://rubygems.org/
|
17
|
+
specs:
|
18
|
+
addressable (2.8.0)
|
19
|
+
public_suffix (>= 2.0.2, < 5.0)
|
20
|
+
amazing_print (1.3.0)
|
21
|
+
ast (2.4.2)
|
22
|
+
coderay (1.1.3)
|
23
|
+
content-type (0.0.1)
|
24
|
+
parslet (~> 1.5)
|
25
|
+
crack (0.4.5)
|
26
|
+
rexml
|
27
|
+
domain_name (0.5.20190701)
|
28
|
+
unf (>= 0.0.5, < 1.0.0)
|
29
|
+
faraday (1.5.0)
|
30
|
+
faraday-em_http (~> 1.0)
|
31
|
+
faraday-em_synchrony (~> 1.0)
|
32
|
+
faraday-excon (~> 1.1)
|
33
|
+
faraday-httpclient (~> 1.0.1)
|
34
|
+
faraday-net_http (~> 1.0)
|
35
|
+
faraday-net_http_persistent (~> 1.1)
|
36
|
+
faraday-patron (~> 1.0)
|
37
|
+
multipart-post (>= 1.2, < 3)
|
38
|
+
ruby2_keywords (>= 0.0.4)
|
39
|
+
faraday-cookie_jar (0.0.7)
|
40
|
+
faraday (>= 0.8.0)
|
41
|
+
http-cookie (~> 1.0.0)
|
42
|
+
faraday-em_http (1.0.0)
|
43
|
+
faraday-em_synchrony (1.0.0)
|
44
|
+
faraday-encoding (0.0.5)
|
45
|
+
faraday
|
46
|
+
faraday-excon (1.1.0)
|
47
|
+
faraday-httpclient (1.0.1)
|
48
|
+
faraday-net_http (1.0.1)
|
49
|
+
faraday-net_http_persistent (1.1.0)
|
50
|
+
faraday-patron (1.0.0)
|
51
|
+
faraday-rate_limiter (0.0.4)
|
52
|
+
faraday
|
53
|
+
faraday_middleware (1.0.0)
|
54
|
+
faraday (~> 1.0)
|
55
|
+
hashdiff (1.0.1)
|
56
|
+
hashie (4.1.0)
|
57
|
+
http-cookie (1.0.4)
|
58
|
+
domain_name (~> 0.5)
|
59
|
+
httpdisk (0.5.2)
|
60
|
+
content-type (~> 0.0)
|
61
|
+
faraday (~> 1.4)
|
62
|
+
faraday-cookie_jar (~> 0.0)
|
63
|
+
faraday_middleware (~> 1.0)
|
64
|
+
slop (~> 4.8)
|
65
|
+
method_source (1.0.0)
|
66
|
+
mini_portile2 (2.5.3)
|
67
|
+
minitest (5.14.4)
|
68
|
+
mocha (1.13.0)
|
69
|
+
multipart-post (2.1.1)
|
70
|
+
nokogiri (1.11.7)
|
71
|
+
mini_portile2 (~> 2.5.0)
|
72
|
+
racc (~> 1.4)
|
73
|
+
parallel (1.20.1)
|
74
|
+
parser (3.0.2.0)
|
75
|
+
ast (~> 2.4.1)
|
76
|
+
parslet (1.8.2)
|
77
|
+
pry (0.14.1)
|
78
|
+
coderay (~> 1.1)
|
79
|
+
method_source (~> 1.0)
|
80
|
+
public_suffix (4.0.6)
|
81
|
+
racc (1.5.2)
|
82
|
+
rainbow (3.0.0)
|
83
|
+
rake (13.0.6)
|
84
|
+
regexp_parser (2.1.1)
|
85
|
+
rexml (3.2.5)
|
86
|
+
rubocop (1.18.3)
|
87
|
+
parallel (~> 1.10)
|
88
|
+
parser (>= 3.0.0.0)
|
89
|
+
rainbow (>= 2.2.2, < 4.0)
|
90
|
+
regexp_parser (>= 1.8, < 3.0)
|
91
|
+
rexml
|
92
|
+
rubocop-ast (>= 1.7.0, < 2.0)
|
93
|
+
ruby-progressbar (~> 1.7)
|
94
|
+
unicode-display_width (>= 1.4.0, < 3.0)
|
95
|
+
rubocop-ast (1.7.0)
|
96
|
+
parser (>= 3.0.1.1)
|
97
|
+
ruby-progressbar (1.11.0)
|
98
|
+
ruby2_keywords (0.0.4)
|
99
|
+
slop (4.9.1)
|
100
|
+
sterile (1.0.23)
|
101
|
+
nokogiri (>= 1.11.7)
|
102
|
+
unf (0.1.4)
|
103
|
+
unf_ext
|
104
|
+
unf_ext (0.0.7.7)
|
105
|
+
unicode-display_width (2.0.0)
|
106
|
+
webmock (3.13.0)
|
107
|
+
addressable (>= 2.3.6)
|
108
|
+
crack (>= 0.3.2)
|
109
|
+
hashdiff (>= 0.4.0, < 2.0.0)
|
110
|
+
|
111
|
+
PLATFORMS
|
112
|
+
ruby
|
113
|
+
|
114
|
+
DEPENDENCIES
|
115
|
+
minitest
|
116
|
+
mocha
|
117
|
+
pry
|
118
|
+
rake
|
119
|
+
rubocop (~> 1.18)
|
120
|
+
sinew!
|
121
|
+
webmock
|
122
|
+
|
123
|
+
BUNDLED WITH
|
124
|
+
2.1.4
|
data/README.md
CHANGED
@@ -1,18 +1,23 @@
|
|
1
|
-
![
|
1
|
+
[![Build Status](https://github.com/gurgeous/sinew/workflows/test/badge.svg?branch=master)](https://github.com/gurgeous/sinew/actions)
|
2
2
|
|
3
3
|
## Welcome to Sinew
|
4
4
|
|
5
|
-
Sinew
|
5
|
+
Sinew is a Ruby library for collecting data from web sites (scraping). Though small, this project is the culmination of years of effort based on crawling systems built at several different companies. Sinew has been used to crawl millions of websites.
|
6
6
|
|
7
|
-
|
7
|
+
## Key Features
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
- Robust crawling with the [Faraday](https://lostisland.github.io/faraday/) HTTP client
|
10
|
+
- Aggressive caching with [httpdisk](https://github.com/gurgeous/httpdisk/)
|
11
|
+
- Easy parsing with HTML cleanup, Nokogiri, JSON, etc.
|
12
|
+
- CSV generation for crawled data
|
12
13
|
|
13
|
-
|
14
|
+
## Installation
|
14
15
|
|
15
16
|
```ruby
|
17
|
+
# install gem
|
18
|
+
$ gem install sinew
|
19
|
+
|
20
|
+
# or add to your Gemfile:
|
16
21
|
gem 'sinew'
|
17
22
|
```
|
18
23
|
|
@@ -20,74 +25,66 @@ gem 'sinew'
|
|
20
25
|
|
21
26
|
<!--- markdown-toc --no-firsth1 --maxdepth 1 readme.md -->
|
22
27
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
- [Sinew 4](#sinew-4-june-2021)
|
29
|
+
- [Quick Example](#quick-example)
|
30
|
+
- [How it Works](#how-it-works)
|
31
|
+
- [Reference](#reference)
|
32
|
+
- [Hints](#hints)
|
33
|
+
- [Limitations](#limitations)
|
34
|
+
- [Changelog](#changelog)
|
35
|
+
- [License](#license)
|
31
36
|
|
32
|
-
## Sinew
|
33
|
-
|
34
|
-
I am pleased to announce the release of Sinew 2.0, a complete rewrite of Sinew for the modern era. Enhancements include:
|
35
|
-
|
36
|
-
* Remove dependencies on active_support, curl and tidy. We use HTTParty now.
|
37
|
-
* Much easier to customize requests in `.sinew` files. For example, setting User-Agent or Bearer tokens.
|
38
|
-
* More operations like `post_json` or the generic `http`. These methods are thin wrappers around HTTParty.
|
39
|
-
* New end-of-run report.
|
40
|
-
* Tests, rubocop, vscode settings, travis, etc.
|
37
|
+
## Sinew 4 (June 2021)
|
41
38
|
|
42
39
|
**Breaking change**
|
43
40
|
|
44
|
-
|
41
|
+
We are pleased to announce the release of Sinew 4. The Sinew DSL exposes a single `sinew` method in lieu of the many methods exposed in Sinew 3. Because of this single entry point, Sinew is now much easier to embed in other applications. Also, each Sinew 4 request returns a full Response object to facilitate parallelism.
|
42
|
+
|
43
|
+
Sinew uses the [Faraday](https://lostisland.github.io/faraday/) HTTP client with the [httpdisk](https://github.com/gurgeous/httpdisk/) middleware for aggressive caching of responses.
|
45
44
|
|
46
45
|
## Quick Example
|
47
46
|
|
48
|
-
Here's an example for collecting the links from
|
47
|
+
Here's an example for collecting the links from httpbingo.org. Paste this into a file called `sample.sinew` and run `sinew sample.sinew`. It will create a `sample.csv` file containing the href and text for each link:
|
49
48
|
|
50
49
|
```ruby
|
51
50
|
# get the url
|
52
|
-
get "
|
51
|
+
response = sinew.get "https://httpbingo.org"
|
53
52
|
|
54
53
|
# use nokogiri to collect links
|
55
|
-
noko.css("ul li a").each do |a|
|
54
|
+
response.noko.css("ul li a").each do |a|
|
56
55
|
row = { }
|
57
56
|
row[:url] = a[:href]
|
58
57
|
row[:title] = a.text
|
59
58
|
|
60
59
|
# append a row to the csv
|
61
|
-
csv_emit(row)
|
60
|
+
sinew.csv_emit(row)
|
62
61
|
end
|
63
62
|
```
|
64
63
|
|
65
|
-
If you paste this into a file called `sample.sinew` and run `sinew sample.sinew`, it will create a `sample.csv` file containing the href and text for each link.
|
66
|
-
|
67
64
|
## How it Works
|
68
65
|
|
69
66
|
There are three main features provided by Sinew.
|
70
67
|
|
71
|
-
####
|
68
|
+
#### Recipes
|
72
69
|
|
73
|
-
Sinew uses recipe files to crawl web sites. Recipes have the
|
70
|
+
Sinew uses recipe files to crawl web sites. Recipes have the .sinew extension, but they are plain old Ruby. Here's a trivial example that calls `get` to make an HTTP GET request:
|
74
71
|
|
75
72
|
```ruby
|
76
|
-
get "https://www.google.com/search?q=darwin"
|
77
|
-
get "https://www.google.com/search", q: "charles darwin"
|
73
|
+
response = sinew.get "https://www.google.com/search?q=darwin"
|
74
|
+
response = sinew.get "https://www.google.com/search", q: "charles darwin"
|
78
75
|
```
|
79
76
|
|
80
|
-
Once you've done a `get`, you
|
77
|
+
Once you've done a `get`, you can access the document in a few different formats. In general, it's easiest to use `noko` to automatically parse and interact with HTML results. If Nokogiri isn't appropriate, fall back to regular expressions run against `body` or `html`. Use `json` if you are expecting a JSON response.
|
81
78
|
|
82
79
|
```ruby
|
83
|
-
get "https://www.google.com/search?q=darwin"
|
80
|
+
response = sinew.get "https://www.google.com/search?q=darwin"
|
84
81
|
|
85
82
|
# pull out the links with nokogiri
|
86
|
-
links = noko.css("a").map {
|
83
|
+
links = response.noko.css("a").map { _1[:href] }
|
87
84
|
puts links.inspect
|
88
85
|
|
89
86
|
# or, use a regex
|
90
|
-
links = html[/<a[^>]+href="([^"]+)/, 1]
|
87
|
+
links = response.html[/<a[^>]+href="([^"]+)/, 1]
|
91
88
|
puts links.inspect
|
92
89
|
```
|
93
90
|
|
@@ -96,16 +93,16 @@ puts links.inspect
|
|
96
93
|
Recipes output CSV files. To continue the example above:
|
97
94
|
|
98
95
|
```ruby
|
99
|
-
get "https://www.google.com/search?q=darwin"
|
100
|
-
noko.css("a").each do |i|
|
96
|
+
response = sinew.get "https://www.google.com/search?q=darwin"
|
97
|
+
response.noko.css("a").each do |i|
|
101
98
|
row = { }
|
102
99
|
row[:href] = i[:href]
|
103
100
|
row[:text] = i.text
|
104
|
-
csv_emit row
|
101
|
+
sinew.csv_emit row
|
105
102
|
end
|
106
103
|
```
|
107
104
|
|
108
|
-
Sinew creates a CSV file with the same name as the recipe, and `csv_emit(hash)` appends a row. The values of your hash are converted to strings:
|
105
|
+
Sinew creates a CSV file with the same name as the recipe, and `csv_emit(hash)` appends a row. The values of your hash are cleaned up and converted to strings:
|
109
106
|
|
110
107
|
1. Nokogiri nodes are converted to text
|
111
108
|
1. Arrays are joined with "|", so you can separate them later
|
@@ -114,82 +111,150 @@ Sinew creates a CSV file with the same name as the recipe, and `csv_emit(hash)`
|
|
114
111
|
|
115
112
|
#### Caching
|
116
113
|
|
117
|
-
|
114
|
+
Sinew uses [httpdisk](https://github.com/gurgeous/httpdisk/) to aggressively cache all HTTP responses to disk in `~/.sinew`. Error responses are cached as well. Each URL will be hit exactly once, and requests are rate limited to one per second. Sinew tries to be polite.
|
118
115
|
|
119
|
-
|
116
|
+
Sinew never deletes files from the cache - that's up to you! Sinew has various command line options to refresh the cache. See `--expires`, `--force` and `--force-errors`.
|
120
117
|
|
121
|
-
Because all requests are cached, you can run Sinew repeatedly with confidence. Run it over and over again while you
|
118
|
+
Because all requests are cached, you can run Sinew repeatedly with confidence. Run it over and over again while you work on your recipe.
|
122
119
|
|
123
|
-
##
|
120
|
+
## Running Sinew
|
124
121
|
|
125
|
-
|
122
|
+
The `sinew` command line has many useful options. You will be using this command many times as you iterate on your recipe:
|
126
123
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
124
|
+
```sh
|
125
|
+
$ bin/sinew --help
|
126
|
+
Usage: sinew [options] [recipe]
|
127
|
+
-l, --limit quit after emitting this many rows
|
128
|
+
--proxy use host[:port] as HTTP proxy
|
129
|
+
--timeout maximum time allowed for the transfer
|
130
|
+
-s, --silent suppress some output
|
131
|
+
-v, --verbose dump emitted rows while running
|
132
|
+
From httpdisk:
|
133
|
+
--dir set custom cache directory
|
134
|
+
--expires when to expire cached requests (ex: 1h, 2d, 3w)
|
135
|
+
--force don't read anything from cache (but still write)
|
136
|
+
--force-errors don't read errors from cache (but still write)
|
137
|
+
```
|
138
|
+
|
139
|
+
`Sinew` also has many runtime options that can be set in your recipe. For example:
|
140
|
+
|
141
|
+
```ruby
|
142
|
+
sinew.options[:headers] = { 'User-Agent' => 'xyz' }
|
143
|
+
|
144
|
+
...
|
145
|
+
```
|
146
|
+
|
147
|
+
Here is the list of available options for `Sinew`:
|
148
|
+
|
149
|
+
- **headers** - default HTTP headers to use on every request
|
150
|
+
- **ignore_params** - ignore these query params when generating httpdisk cache keys
|
151
|
+
- **insecure** - ignore SSL errors
|
152
|
+
- **params** - default query parameters to use on every request
|
153
|
+
- **rate_limit** - minimum time between network requests
|
154
|
+
- **retries** - number of times to retry each failed request
|
155
|
+
- **url_prefix** - default URL base to use on every request
|
156
|
+
|
157
|
+
## Reference
|
158
|
+
|
159
|
+
#### Making HTTP requests
|
160
|
+
|
161
|
+
- `sinew.get(url, params = nil, headers = nil)` - fetch a url with GET
|
162
|
+
- `sinew.post(url, body = nil, headers = nil)` - fetch a url with POST, using `body` as the URL encoded POST body.
|
163
|
+
- `sinew.post_json(url, body = nil, headers = nil)` - fetch a url with POST, using `body` (encoded as JSON) as the POST body.
|
131
164
|
|
132
165
|
#### Parsing the response
|
133
166
|
|
134
|
-
|
167
|
+
Each request method returns a `Sinew::Response`. The response has several helpers to make parsing easier:
|
135
168
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
169
|
+
- `body` - the raw body
|
170
|
+
- `html` - like `body`, but with a handful of HTML-specific whitespace cleanups
|
171
|
+
- `noko` - parse as HTML and return a [Nokogiri](http://nokogiri.org) document
|
172
|
+
- `xml` - parse as XML and return a [Nokogiri](http://nokogiri.org) document
|
173
|
+
- `json` - parse as JSON, with symbolized keys
|
174
|
+
- `mash` - parse as JSON and return a [Hashie::Mash](https://github.com/hashie/hashie#mash)
|
175
|
+
- `url` - the url of the request. If the request goes through a redirect, `url` will reflect the final url.
|
143
176
|
|
144
177
|
#### Writing CSV
|
145
178
|
|
146
|
-
|
147
|
-
|
179
|
+
- `sinew.csv_header(columns)` - specify the columns for CSV output. If you don't call this, Sinew will use the keys from the first call to `sinew.csv_emit`.
|
180
|
+
- `sinew.csv_emit(hash)` - append a row to the CSV file
|
181
|
+
|
182
|
+
#### Advanced: Cache
|
183
|
+
|
184
|
+
Sinew has some advanced helpers for checking the httpdisk cache. For the following methods, `body` hashes default to form body type.
|
185
|
+
|
186
|
+
- `sinew.cached?(method, url, params = nil, body = nil)` - check if request is cached
|
187
|
+
- `sinew.uncache(method, url, params = nil, body = nil)` - remove cache file, if any
|
188
|
+
- `sinew.status(method, url, params = nil, body = nil)` - get httpdisk status
|
189
|
+
|
190
|
+
Plus some caching helpers in Sinew::Response:
|
191
|
+
|
192
|
+
- `diskpath` - the location on disk for the cached httpdisk response
|
193
|
+
- `uncache` - remove cache file for this response
|
148
194
|
|
149
195
|
## Hints
|
150
196
|
|
151
197
|
Writing Sinew recipes is fun and easy. The builtin caching means you can iterate quickly, since you won't have to re-fetch the data. Here are some hints for writing idiomatic recipes:
|
152
198
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
199
|
+
- Sinew doesn't (yet) check robots.txt - please check it manually.
|
200
|
+
- Prefer Nokogiri over regular expressions wherever possible. Learn [CSS selectors](http://www.w3schools.com/cssref/css_selectors.asp).
|
201
|
+
- In Chrome, `$` in the console is your friend.
|
202
|
+
- Fall back to regular expressions if you're desperate. Depending on the site, use either `body` or `html`. `html` is probably your best bet. `body` is good for crawling JavaScript, but it's fragile if the site changes.
|
203
|
+
- Learn to love `String#[regexp]`, which is an obscure operator but incredibly handy for Sinew.
|
204
|
+
- Laziness is useful. Keep your CSS selectors and regular expressions simple, so maybe they'll work again the next time you need to crawl a site.
|
205
|
+
- Don't be afraid to mix CSS selectors, regular expressions, and Ruby:
|
160
206
|
|
161
207
|
```ruby
|
162
|
-
noko.css("table")[4].css("td").select
|
208
|
+
noko.css("table")[4].css("td").select do
|
209
|
+
_1[:width].to_i > 80
|
210
|
+
end.map(&:text)
|
163
211
|
```
|
164
212
|
|
165
|
-
|
166
|
-
|
167
|
-
|
213
|
+
- Debug your recipes using plain old `puts`, or better yet use `ap` from [amazing_print](https://github.com/amazing-print/amazing_print).
|
214
|
+
- Run `sinew -v` to get a report on every `csv_emit`. Very handy.
|
215
|
+
- Add the CSV files to your git repo. That way you can version them and get diffs!
|
168
216
|
|
169
217
|
## Limitations
|
170
218
|
|
171
|
-
|
172
|
-
|
219
|
+
- Caching is based on URL, so use caution with cookies and other forms of authentication
|
220
|
+
- Almost no support for international (non-English) characters
|
173
221
|
|
174
222
|
## Changelog
|
175
223
|
|
224
|
+
#### 4.0.0 (July 2021)
|
225
|
+
|
226
|
+
- Rewritten to use simpler DSL
|
227
|
+
- Upgraded to httpdisk 0.5 to take advantage of the new encoding support
|
228
|
+
|
229
|
+
#### 3.0.0 (May 2021)
|
230
|
+
|
231
|
+
- Major rewrite of network and caching layer. See above.
|
232
|
+
- Use Faraday HTTP client with sinew middleware for caching.
|
233
|
+
- Supports multiple proxies (`--proxy host1,host2,...`)
|
234
|
+
|
235
|
+
#### 2.0.4 (May 2018)
|
236
|
+
|
237
|
+
- Handle and cache more errors (too many redirects, connection failures, etc.)
|
238
|
+
- Support for adding uri.scheme in generate_cache_key
|
239
|
+
- Added status `code`, a peer to `uri`, `raw`, etc.
|
240
|
+
|
176
241
|
#### 2.0.3 (May 2018)
|
177
242
|
|
178
|
-
|
243
|
+
- `&amp;` now normalizes to `&` (not `and`)
|
179
244
|
|
180
245
|
#### 2.0.2 (May 2018)
|
181
246
|
|
182
|
-
|
183
|
-
|
184
|
-
|
247
|
+
- Support for `--limit`, `--proxy` and the `xml` variable
|
248
|
+
- Dedup - warn and ignore if row[:url] has already been emitted
|
249
|
+
- Auto gunzip if contents are compressed
|
185
250
|
|
186
251
|
#### 2.0.1 (May 2018)
|
187
252
|
|
188
|
-
|
253
|
+
- Support for legacy cached `head` files from Sinew 1
|
189
254
|
|
190
255
|
#### 2.0.0 (May 2018)
|
191
256
|
|
192
|
-
|
257
|
+
- Complete rewrite. See above.
|
193
258
|
|
194
259
|
#### 1.0.3 (June 2012)
|
195
260
|
|