sinew 3.0.1 → 4.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +4 -8
- data/.gitignore +3 -5
- data/.rubocop.yml +13 -48
- data/Gemfile +9 -9
- data/Gemfile.lock +132 -0
- data/LICENSE +1 -1
- data/README.md +113 -48
- data/Rakefile +3 -51
- data/bin/sinew +13 -41
- data/justfile +59 -0
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +252 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +46 -72
- data/lib/sinew/{connection → middleware}/log_formatter.rb +3 -2
- data/lib/sinew/nokogiri_ext.rb +13 -22
- data/lib/sinew/response.rb +41 -52
- data/lib/sinew/version.rb +1 -1
- data/lib/sinew.rb +23 -9
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +24 -20
- metadata +56 -31
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -5
- data/lib/sinew/connection/rate_limit.rb +0 -29
- data/lib/sinew/connection.rb +0 -52
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -115
- data/lib/sinew/output.rb +0 -133
- data/lib/sinew/request.rb +0 -86
- data/lib/sinew/runtime_options.rb +0 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 30b6d7da43f53885bd23a2d283ec2dc34f0f38640d4bac3c5f3275deed4f8e84
|
4
|
+
data.tar.gz: dfb86670352efb63ddad965418b619ab74edf8738a7ae55ed4c8ea521c2f625c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 230c4c81b27dd5087bb44ac3b89b29bed65222d81327c4dd8580c5762e50cb1d54d5521d5d3349f7d40a228244e9fc436d502fa22e879909ce2b983b9d1cee0f
|
7
|
+
data.tar.gz: ce76f649971d961bd22de2f8b71f1226838b9e8385dd269fe2e695e93164e850b16fff3a6c4c42b5fb961beff9a6675deb6fa5e66060053274a464d878cd0d35
|
data/.github/workflows/test.yml
CHANGED
@@ -2,11 +2,7 @@ name: test
|
|
2
2
|
|
3
3
|
on:
|
4
4
|
push:
|
5
|
-
paths-ignore:
|
6
|
-
- '**.md'
|
7
5
|
pull_request:
|
8
|
-
paths-ignore:
|
9
|
-
- '**.md'
|
10
6
|
workflow_dispatch:
|
11
7
|
|
12
8
|
jobs:
|
@@ -15,12 +11,12 @@ jobs:
|
|
15
11
|
max-parallel: 3
|
16
12
|
matrix:
|
17
13
|
os: [ubuntu, macos]
|
18
|
-
ruby-version: [3.
|
14
|
+
ruby-version: [head, 3.2, 3.1]
|
19
15
|
runs-on: ${{ matrix.os }}-latest
|
20
16
|
steps:
|
21
|
-
- uses: actions/checkout@
|
17
|
+
- uses: actions/checkout@v3
|
18
|
+
- uses: taiki-e/install-action@just
|
22
19
|
- uses: ruby/setup-ruby@v1
|
23
20
|
with:
|
24
21
|
ruby-version: ${{ matrix.ruby-version }}
|
25
|
-
- run:
|
26
|
-
- run: bundle exec rake test
|
22
|
+
- run: just ci
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
@@ -1,52 +1,17 @@
|
|
1
|
+
require:
|
2
|
+
- standard
|
3
|
+
|
4
|
+
inherit_gem:
|
5
|
+
standard: config/base.yml
|
6
|
+
|
1
7
|
AllCops:
|
2
|
-
TargetRubyVersion: 2.7
|
3
8
|
NewCops: enable
|
9
|
+
SuggestExtensions: false
|
10
|
+
TargetRubyVersion: 3.1
|
4
11
|
|
5
|
-
#
|
6
|
-
|
7
|
-
|
8
|
-
Layout/CaseIndentation:
|
9
|
-
EnforcedStyle: end
|
10
|
-
Layout/EndAlignment:
|
11
|
-
EnforcedStyleAlignWith: variable
|
12
|
-
Style/CollectionMethods:
|
13
|
-
Enabled: true
|
14
|
-
PreferredMethods:
|
15
|
-
reduce: inject
|
16
|
-
Style/EmptyMethod:
|
17
|
-
Enabled: false
|
18
|
-
Style/TrailingCommaInArrayLiteral:
|
19
|
-
EnforcedStyleForMultiline: consistent_comma
|
20
|
-
Style/TrailingCommaInHashLiteral:
|
21
|
-
EnforcedStyleForMultiline: consistent_comma
|
22
|
-
|
23
|
-
# amd: these seem extreme
|
24
|
-
Lint/AssignmentInCondition: { Enabled: false } # I do this all the time
|
25
|
-
Lint/SuppressedException: { Enabled: false } # blank rescues are useful
|
26
|
-
Naming/BinaryOperatorParameterName: { Enabled: false } # silly
|
27
|
-
Naming/HeredocDelimiterNaming: { Enabled: false } # silly
|
28
|
-
Naming/MethodParameterName: { Enabled: false } # silly
|
29
|
-
Style/AccessorGrouping: { Enabled: false } # silly
|
30
|
-
Style/AsciiComments: { Enabled: false } # silly
|
31
|
-
Style/ClassAndModuleChildren: { Enabled: false } # silly
|
32
|
-
Style/Documentation: { Enabled: false } # we don't need this
|
33
|
-
Style/DoubleNegation: { Enabled: false } # silly
|
34
|
-
Style/FormatStringToken: { Enabled: false } # we like printf here
|
35
|
-
Style/FrozenStringLiteralComment: { Enabled: false } # seems excessive
|
36
|
-
Style/GuardClause: { Enabled: false } # confusing
|
37
|
-
Style/HashTransformValues: { Enabled: false } # breaks code by trying to apply to an array
|
38
|
-
Style/IfUnlessModifier: { Enabled: false } # personally I hate unless
|
39
|
-
Style/NegatedIf: { Enabled: false } # these are fine
|
40
|
-
Style/Next: { Enabled: false } # these are fine
|
41
|
-
Style/NumericPredicate: { Enabled: false } # silly
|
42
|
-
Style/ParallelAssignment: { Enabled: false } # these are fine
|
43
|
-
Style/PerlBackrefs: { Enabled: false } # these are fine
|
44
|
-
Style/RaiseArgs: { Enabled: false } # silly
|
45
|
-
Style/RedundantAssignment: { Enabled: false } # these are usually on purpose
|
46
|
-
Style/RegexpLiteral: { Enabled: false } # these are fine
|
47
|
-
Style/SoleNestedConditional: { Enabled: false } # these are fine
|
48
|
-
Style/StderrPuts: { Enabled: false } # this is awful
|
12
|
+
#
|
13
|
+
# fight with standardrb!
|
14
|
+
#
|
49
15
|
|
50
|
-
|
51
|
-
|
52
|
-
Enabled: false
|
16
|
+
Style/RedundantReturn: { Enabled: false }
|
17
|
+
Style/HashSyntax: { EnforcedShorthandSyntax: always }
|
data/Gemfile
CHANGED
@@ -1,11 +1,11 @@
|
|
1
|
-
source
|
1
|
+
source "http://rubygems.org"
|
2
|
+
gemspec
|
2
3
|
|
3
|
-
group :development do
|
4
|
-
gem
|
5
|
-
gem
|
6
|
-
gem
|
7
|
-
gem
|
8
|
-
gem
|
4
|
+
group :development, :test do
|
5
|
+
gem "minitest"
|
6
|
+
gem "mocha"
|
7
|
+
gem "pry"
|
8
|
+
gem "rake"
|
9
|
+
gem "standard", require: false
|
10
|
+
gem "webmock"
|
9
11
|
end
|
10
|
-
|
11
|
-
gemspec
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
sinew (4.0.1)
|
5
|
+
amazing_print (~> 1.5)
|
6
|
+
faraday (~> 2.7)
|
7
|
+
faraday-encoding (~> 0.0)
|
8
|
+
faraday-rate_limiter (~> 0.0)
|
9
|
+
faraday-retry (~> 2.0)
|
10
|
+
hashie (~> 5.0)
|
11
|
+
httpdisk (~> 1.0)
|
12
|
+
nokogiri (~> 1.15)
|
13
|
+
slop (~> 4.10)
|
14
|
+
sterile (~> 1.0)
|
15
|
+
|
16
|
+
GEM
|
17
|
+
remote: http://rubygems.org/
|
18
|
+
specs:
|
19
|
+
addressable (2.8.5)
|
20
|
+
public_suffix (>= 2.0.2, < 6.0)
|
21
|
+
amazing_print (1.5.0)
|
22
|
+
ast (2.4.2)
|
23
|
+
coderay (1.1.3)
|
24
|
+
content-type (0.0.2)
|
25
|
+
parslet (~> 2.0)
|
26
|
+
crack (0.4.5)
|
27
|
+
rexml
|
28
|
+
domain_name (0.5.20190701)
|
29
|
+
unf (>= 0.0.5, < 1.0.0)
|
30
|
+
faraday (2.7.10)
|
31
|
+
faraday-net_http (>= 2.0, < 3.1)
|
32
|
+
ruby2_keywords (>= 0.0.4)
|
33
|
+
faraday-cookie_jar (0.0.7)
|
34
|
+
faraday (>= 0.8.0)
|
35
|
+
http-cookie (~> 1.0.0)
|
36
|
+
faraday-encoding (0.0.5)
|
37
|
+
faraday
|
38
|
+
faraday-follow_redirects (0.3.0)
|
39
|
+
faraday (>= 1, < 3)
|
40
|
+
faraday-net_http (3.0.2)
|
41
|
+
faraday-rate_limiter (0.0.4)
|
42
|
+
faraday
|
43
|
+
faraday-retry (2.0.0)
|
44
|
+
faraday (~> 2.0)
|
45
|
+
hashdiff (1.0.1)
|
46
|
+
hashie (5.0.0)
|
47
|
+
http-cookie (1.0.5)
|
48
|
+
domain_name (~> 0.5)
|
49
|
+
httpdisk (1.0.0)
|
50
|
+
content-type (~> 0.0)
|
51
|
+
faraday (~> 2.7)
|
52
|
+
faraday-cookie_jar (~> 0.0)
|
53
|
+
faraday-follow_redirects (~> 0.0)
|
54
|
+
slop (~> 4.10)
|
55
|
+
json (2.6.3)
|
56
|
+
language_server-protocol (3.17.0.3)
|
57
|
+
lint_roller (1.1.0)
|
58
|
+
method_source (1.0.0)
|
59
|
+
minitest (5.19.0)
|
60
|
+
mocha (2.1.0)
|
61
|
+
ruby2_keywords (>= 0.0.5)
|
62
|
+
nokogiri (1.15.4-arm64-darwin)
|
63
|
+
racc (~> 1.4)
|
64
|
+
parallel (1.23.0)
|
65
|
+
parser (3.2.2.3)
|
66
|
+
ast (~> 2.4.1)
|
67
|
+
racc
|
68
|
+
parslet (2.0.0)
|
69
|
+
pry (0.14.2)
|
70
|
+
coderay (~> 1.1)
|
71
|
+
method_source (~> 1.0)
|
72
|
+
public_suffix (5.0.3)
|
73
|
+
racc (1.7.1)
|
74
|
+
rainbow (3.1.1)
|
75
|
+
rake (13.0.6)
|
76
|
+
regexp_parser (2.8.1)
|
77
|
+
rexml (3.2.6)
|
78
|
+
rubocop (1.52.1)
|
79
|
+
json (~> 2.3)
|
80
|
+
parallel (~> 1.10)
|
81
|
+
parser (>= 3.2.2.3)
|
82
|
+
rainbow (>= 2.2.2, < 4.0)
|
83
|
+
regexp_parser (>= 1.8, < 3.0)
|
84
|
+
rexml (>= 3.2.5, < 4.0)
|
85
|
+
rubocop-ast (>= 1.28.0, < 2.0)
|
86
|
+
ruby-progressbar (~> 1.7)
|
87
|
+
unicode-display_width (>= 2.4.0, < 3.0)
|
88
|
+
rubocop-ast (1.29.0)
|
89
|
+
parser (>= 3.2.1.0)
|
90
|
+
rubocop-performance (1.18.0)
|
91
|
+
rubocop (>= 1.7.0, < 2.0)
|
92
|
+
rubocop-ast (>= 0.4.0)
|
93
|
+
ruby-progressbar (1.13.0)
|
94
|
+
ruby2_keywords (0.0.5)
|
95
|
+
slop (4.10.1)
|
96
|
+
standard (1.30.1)
|
97
|
+
language_server-protocol (~> 3.17.0.2)
|
98
|
+
lint_roller (~> 1.0)
|
99
|
+
rubocop (~> 1.52.0)
|
100
|
+
standard-custom (~> 1.0.0)
|
101
|
+
standard-performance (~> 1.1.0)
|
102
|
+
standard-custom (1.0.2)
|
103
|
+
lint_roller (~> 1.0)
|
104
|
+
rubocop (~> 1.50)
|
105
|
+
standard-performance (1.1.2)
|
106
|
+
lint_roller (~> 1.1)
|
107
|
+
rubocop-performance (~> 1.18.0)
|
108
|
+
sterile (1.0.25)
|
109
|
+
nokogiri (>= 1.11.7)
|
110
|
+
unf (0.1.4)
|
111
|
+
unf_ext
|
112
|
+
unf_ext (0.0.8.2)
|
113
|
+
unicode-display_width (2.4.2)
|
114
|
+
webmock (3.18.1)
|
115
|
+
addressable (>= 2.8.0)
|
116
|
+
crack (>= 0.3.2)
|
117
|
+
hashdiff (>= 0.4.0, < 2.0.0)
|
118
|
+
|
119
|
+
PLATFORMS
|
120
|
+
arm64-darwin-20
|
121
|
+
|
122
|
+
DEPENDENCIES
|
123
|
+
minitest
|
124
|
+
mocha
|
125
|
+
pry
|
126
|
+
rake
|
127
|
+
sinew!
|
128
|
+
standard
|
129
|
+
webmock
|
130
|
+
|
131
|
+
BUNDLED WITH
|
132
|
+
2.4.19
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,18 +1,23 @@
|
|
1
|
-
[![Build Status](https://github.com/gurgeous/sinew/workflows/test/badge.svg?branch=
|
1
|
+
[![Build Status](https://github.com/gurgeous/sinew/workflows/test/badge.svg?branch=main)](https://github.com/gurgeous/sinew/actions)
|
2
2
|
|
3
3
|
## Welcome to Sinew
|
4
4
|
|
5
|
-
Sinew
|
5
|
+
Sinew is a Ruby library for collecting data from web sites (scraping). Though small, this project is the culmination of years of effort based on crawling systems built at several different companies. Sinew has been used to crawl millions of websites.
|
6
6
|
|
7
|
-
|
7
|
+
## Key Features
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
- Robust crawling with the [Faraday](https://lostisland.github.io/faraday/) HTTP client
|
10
|
+
- Aggressive caching with [httpdisk](https://github.com/gurgeous/httpdisk/)
|
11
|
+
- Easy parsing with HTML cleanup, Nokogiri, JSON, etc.
|
12
|
+
- CSV generation for crawled data
|
12
13
|
|
13
|
-
|
14
|
+
## Installation
|
14
15
|
|
15
16
|
```ruby
|
17
|
+
# install gem
|
18
|
+
$ gem install sinew
|
19
|
+
|
20
|
+
# or add to your Gemfile:
|
16
21
|
gem 'sinew'
|
17
22
|
```
|
18
23
|
|
@@ -20,22 +25,22 @@ gem 'sinew'
|
|
20
25
|
|
21
26
|
<!--- markdown-toc --no-firsth1 --maxdepth 1 readme.md -->
|
22
27
|
|
23
|
-
- [Sinew
|
28
|
+
- [Sinew 4](#sinew-4-june-2021)
|
24
29
|
- [Quick Example](#quick-example)
|
25
30
|
- [How it Works](#how-it-works)
|
26
|
-
- [
|
31
|
+
- [Reference](#reference)
|
27
32
|
- [Hints](#hints)
|
28
33
|
- [Limitations](#limitations)
|
29
34
|
- [Changelog](#changelog)
|
30
35
|
- [License](#license)
|
31
36
|
|
32
|
-
## Sinew
|
33
|
-
|
34
|
-
I am pleased to announce the release of Sinew 3.0. Sinew has been streamlined and updated to use the [Faraday](https://lostisland.github.io/faraday/) HTTP client with [sinew](https://github.com/gurgeous/sinew/) middleware for caching.
|
37
|
+
## Sinew 4 (June 2021)
|
35
38
|
|
36
39
|
**Breaking change**
|
37
40
|
|
38
|
-
Sinew
|
41
|
+
We are pleased to announce the release of Sinew 4. The Sinew DSL exposes a single `sinew` method in lieu of the many methods exposed in Sinew 3. Because of this single entry point, Sinew is now much easier to embed in other applications. Also, each Sinew 4 request returns a full Response object to facilitate parallelism.
|
42
|
+
|
43
|
+
Sinew uses the [Faraday](https://lostisland.github.io/faraday/) HTTP client with the [httpdisk](https://github.com/gurgeous/httpdisk/) middleware for aggressive caching of responses.
|
39
44
|
|
40
45
|
## Quick Example
|
41
46
|
|
@@ -43,16 +48,16 @@ Here's an example for collecting the links from httpbingo.org. Paste this into a
|
|
43
48
|
|
44
49
|
```ruby
|
45
50
|
# get the url
|
46
|
-
get "
|
51
|
+
response = sinew.get "https://httpbingo.org"
|
47
52
|
|
48
53
|
# use nokogiri to collect links
|
49
|
-
noko.css("ul li a").each do |a|
|
54
|
+
response.noko.css("ul li a").each do |a|
|
50
55
|
row = { }
|
51
56
|
row[:url] = a[:href]
|
52
57
|
row[:title] = a.text
|
53
58
|
|
54
59
|
# append a row to the csv
|
55
|
-
csv_emit(row)
|
60
|
+
sinew.csv_emit(row)
|
56
61
|
end
|
57
62
|
```
|
58
63
|
|
@@ -60,26 +65,26 @@ end
|
|
60
65
|
|
61
66
|
There are three main features provided by Sinew.
|
62
67
|
|
63
|
-
####
|
68
|
+
#### Recipes
|
64
69
|
|
65
|
-
Sinew uses recipe files to crawl web sites. Recipes have the
|
70
|
+
Sinew uses recipe files to crawl web sites. Recipes have the .sinew extension, but they are plain old Ruby. Here's a trivial example that calls `get` to make an HTTP GET request:
|
66
71
|
|
67
72
|
```ruby
|
68
|
-
get "https://www.google.com/search?q=darwin"
|
69
|
-
get "https://www.google.com/search", q: "charles darwin"
|
73
|
+
response = sinew.get "https://www.google.com/search?q=darwin"
|
74
|
+
response = sinew.get "https://www.google.com/search", q: "charles darwin"
|
70
75
|
```
|
71
76
|
|
72
|
-
Once you've done a `get`, you
|
77
|
+
Once you've done a `get`, you can access the document in a few different formats. In general, it's easiest to use `noko` to automatically parse and interact with HTML results. If Nokogiri isn't appropriate, fall back to regular expressions run against `body` or `html`. Use `json` if you are expecting a JSON response.
|
73
78
|
|
74
79
|
```ruby
|
75
|
-
get "https://www.google.com/search?q=darwin"
|
80
|
+
response = sinew.get "https://www.google.com/search?q=darwin"
|
76
81
|
|
77
82
|
# pull out the links with nokogiri
|
78
|
-
links = noko.css("a").map {
|
83
|
+
links = response.noko.css("a").map { _1[:href] }
|
79
84
|
puts links.inspect
|
80
85
|
|
81
86
|
# or, use a regex
|
82
|
-
links = html[/<a[^>]+href="([^"]+)/, 1]
|
87
|
+
links = response.html[/<a[^>]+href="([^"]+)/, 1]
|
83
88
|
puts links.inspect
|
84
89
|
```
|
85
90
|
|
@@ -88,16 +93,16 @@ puts links.inspect
|
|
88
93
|
Recipes output CSV files. To continue the example above:
|
89
94
|
|
90
95
|
```ruby
|
91
|
-
get "https://www.google.com/search?q=darwin"
|
92
|
-
noko.css("a").each do |i|
|
96
|
+
response = sinew.get "https://www.google.com/search?q=darwin"
|
97
|
+
response.noko.css("a").each do |i|
|
93
98
|
row = { }
|
94
99
|
row[:href] = i[:href]
|
95
100
|
row[:text] = i.text
|
96
|
-
csv_emit row
|
101
|
+
sinew.csv_emit row
|
97
102
|
end
|
98
103
|
```
|
99
104
|
|
100
|
-
Sinew creates a CSV file with the same name as the recipe, and `csv_emit(hash)` appends a row. The values of your hash are converted to strings:
|
105
|
+
Sinew creates a CSV file with the same name as the recipe, and `csv_emit(hash)` appends a row. The values of your hash are cleaned up and converted to strings:
|
101
106
|
|
102
107
|
1. Nokogiri nodes are converted to text
|
103
108
|
1. Arrays are joined with "|", so you can separate them later
|
@@ -108,35 +113,84 @@ Sinew creates a CSV file with the same name as the recipe, and `csv_emit(hash)`
|
|
108
113
|
|
109
114
|
Sinew uses [httpdisk](https://github.com/gurgeous/httpdisk/) to aggressively cache all HTTP responses to disk in `~/.sinew`. Error responses are cached as well. Each URL will be hit exactly once, and requests are rate limited to one per second. Sinew tries to be polite.
|
110
115
|
|
111
|
-
Sinew never deletes files from the cache - that's up to you!
|
116
|
+
Sinew never deletes files from the cache - that's up to you! Sinew has various command line options to refresh the cache. See `--expires`, `--force` and `--force-errors`.
|
112
117
|
|
113
|
-
Because all requests are cached, you can run Sinew repeatedly with confidence. Run it over and over again while you
|
118
|
+
Because all requests are cached, you can run Sinew repeatedly with confidence. Run it over and over again while you work on your recipe.
|
114
119
|
|
115
|
-
##
|
120
|
+
## Running Sinew
|
116
121
|
|
117
|
-
|
122
|
+
The `sinew` command line has many useful options. You will be using this command many times as you iterate on your recipe:
|
118
123
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
-
|
124
|
+
```sh
|
125
|
+
$ bin/sinew --help
|
126
|
+
Usage: sinew [options] [recipe]
|
127
|
+
-l, --limit quit after emitting this many rows
|
128
|
+
--proxy use host[:port] as HTTP proxy
|
129
|
+
--timeout maximum time allowed for the transfer
|
130
|
+
-s, --silent suppress some output
|
131
|
+
-v, --verbose dump emitted rows while running
|
132
|
+
From httpdisk:
|
133
|
+
--dir set custom cache directory
|
134
|
+
--expires when to expire cached requests (ex: 1h, 2d, 3w)
|
135
|
+
--force don't read anything from cache (but still write)
|
136
|
+
--force-errors don't read errors from cache (but still write)
|
137
|
+
```
|
138
|
+
|
139
|
+
`Sinew` also has many runtime options that can be set in your recipe. For example:
|
140
|
+
|
141
|
+
```ruby
|
142
|
+
sinew.options[:headers] = { 'User-Agent' => 'xyz' }
|
143
|
+
|
144
|
+
...
|
145
|
+
```
|
146
|
+
|
147
|
+
Here is the list of available options for `Sinew`:
|
148
|
+
|
149
|
+
- **headers** - default HTTP headers to use on every request
|
150
|
+
- **ignore_params** - ignore these query params when generating httpdisk cache keys
|
151
|
+
- **insecure** - ignore SSL errors
|
152
|
+
- **params** - default query parameters to use on every request
|
153
|
+
- **rate_limit** - minimum time between network requests
|
154
|
+
- **retries** - number of times to retry each failed request
|
155
|
+
- **url_prefix** - default URL base to use on every request
|
156
|
+
|
157
|
+
## Reference
|
158
|
+
|
159
|
+
#### Making HTTP requests
|
160
|
+
|
161
|
+
- `sinew.get(url, params = nil, headers = nil)` - fetch a url with GET
|
162
|
+
- `sinew.post(url, body = nil, headers = nil)` - fetch a url with POST, using `body` as the URL encoded POST body.
|
163
|
+
- `sinew.post_json(url, body = nil, headers = nil)` - fetch a url with POST, using `body` (encoded as JSON) as the POST body.
|
123
164
|
|
124
165
|
#### Parsing the response
|
125
166
|
|
126
|
-
|
167
|
+
Each request method returns a `Sinew::Response`. The response has several helpers to make parsing easier:
|
127
168
|
|
128
|
-
- `
|
129
|
-
- `html` - like `
|
130
|
-
- `noko` - parse
|
131
|
-
- `xml` - parse
|
132
|
-
- `json` - parse
|
133
|
-
- `
|
134
|
-
- `
|
169
|
+
- `body` - the raw body
|
170
|
+
- `html` - like `body`, but with a handful of HTML-specific whitespace cleanups
|
171
|
+
- `noko` - parse as HTML and return a [Nokogiri](http://nokogiri.org) document
|
172
|
+
- `xml` - parse as XML and return a [Nokogiri](http://nokogiri.org) document
|
173
|
+
- `json` - parse as JSON, with symbolized keys
|
174
|
+
- `mash` - parse as JSON and return a [Hashie::Mash](https://github.com/hashie/hashie#mash)
|
175
|
+
- `url` - the url of the request. If the request goes through a redirect, `url` will reflect the final url.
|
135
176
|
|
136
177
|
#### Writing CSV
|
137
178
|
|
138
|
-
- `csv_header(
|
139
|
-
- `csv_emit(hash)` - append a row to the CSV file
|
179
|
+
- `sinew.csv_header(columns)` - specify the columns for CSV output. If you don't call this, Sinew will use the keys from the first call to `sinew.csv_emit`.
|
180
|
+
- `sinew.csv_emit(hash)` - append a row to the CSV file
|
181
|
+
|
182
|
+
#### Advanced: Cache
|
183
|
+
|
184
|
+
Sinew has some advanced helpers for checking the httpdisk cache. For the following methods, `body` hashes default to form body type.
|
185
|
+
|
186
|
+
- `sinew.cached?(method, url, params = nil, body = nil)` - check if request is cached
|
187
|
+
- `sinew.uncache(method, url, params = nil, body = nil)` - remove cache file, if any
|
188
|
+
- `sinew.status(method, url, params = nil, body = nil)` - get httpdisk status
|
189
|
+
|
190
|
+
Plus some caching helpers in Sinew::Response:
|
191
|
+
|
192
|
+
- `diskpath` - the location on disk for the cached httpdisk response
|
193
|
+
- `uncache` - remove cache file for this response
|
140
194
|
|
141
195
|
## Hints
|
142
196
|
|
@@ -145,13 +199,15 @@ Writing Sinew recipes is fun and easy. The builtin caching means you can iterate
|
|
145
199
|
- Sinew doesn't (yet) check robots.txt - please check it manually.
|
146
200
|
- Prefer Nokogiri over regular expressions wherever possible. Learn [CSS selectors](http://www.w3schools.com/cssref/css_selectors.asp).
|
147
201
|
- In Chrome, `$` in the console is your friend.
|
148
|
-
- Fallback to regular expressions if you're desperate. Depending on the site, use either `
|
202
|
+
- Fall back to regular expressions if you're desperate. Depending on the site, use either `body` or `html`. `html` is probably your best bet. `body` is good for crawling JavaScript, but it's fragile if the site changes.
|
149
203
|
- Learn to love `String#[regexp]`, which is an obscure operator but incredibly handy for Sinew.
|
150
204
|
- Laziness is useful. Keep your CSS selectors and regular expressions simple, so maybe they'll work again the next time you need to crawl a site.
|
151
205
|
- Don't be afraid to mix CSS selectors, regular expressions, and Ruby:
|
152
206
|
|
153
207
|
```ruby
|
154
|
-
noko.css("table")[4].css("td").select
|
208
|
+
noko.css("table")[4].css("td").select do
|
209
|
+
_1[:width].to_i > 80
|
210
|
+
end.map(&:text)
|
155
211
|
```
|
156
212
|
|
157
213
|
- Debug your recipes using plain old `puts`, or better yet use `ap` from [amazing_print](https://github.com/amazing-print/amazing_print).
|
@@ -165,6 +221,15 @@ noko.css("table")[4].css("td").select { |i| i[:width].to_i > 80 }.map(&:text)
|
|
165
221
|
|
166
222
|
## Changelog
|
167
223
|
|
224
|
+
#### 4.0.1 (Aug 2023)
|
225
|
+
|
226
|
+
- Updated dependencies, added justfile
|
227
|
+
|
228
|
+
#### 4.0.0 (Jul 2021)
|
229
|
+
|
230
|
+
- Rewritten to use simpler DSL
|
231
|
+
- Upgraded to httpdisk 0.5 to take advantage of the new encoding support
|
232
|
+
|
168
233
|
#### 3.0.0 (May 2021)
|
169
234
|
|
170
235
|
- Major rewrite of network and caching layer. See above.
|
data/Rakefile
CHANGED
@@ -1,53 +1,5 @@
|
|
1
|
-
require
|
1
|
+
require "bundler/setup"
|
2
|
+
require "minitest/test_task"
|
2
3
|
|
3
|
-
require 'rake/testtask'
|
4
|
-
require 'sinew/version'
|
5
|
-
|
6
|
-
# load the spec, we use it below
|
7
|
-
spec = Gem::Specification.load('sinew.gemspec')
|
8
|
-
|
9
|
-
#
|
10
|
-
# testing
|
11
|
-
# don't forget about TESTOPTS="--verbose" rake
|
12
|
-
# also: rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
|
13
|
-
#
|
14
|
-
|
15
|
-
# test (default)
|
16
4
|
task default: :test
|
17
|
-
|
18
|
-
Rake::TestTask.new do
|
19
|
-
_1.libs << 'test'
|
20
|
-
_1.warning = false # sterile has a few issues here
|
21
|
-
end
|
22
|
-
|
23
|
-
# Watch rb files, run tests whenever something changes
|
24
|
-
task :watch do
|
25
|
-
# https://superuser.com/a/665208 / https://unix.stackexchange.com/a/42288
|
26
|
-
system("while true; do find . -name '*.rb' | entr -c -d rake; test $? -gt 128 && break; done")
|
27
|
-
end
|
28
|
-
|
29
|
-
#
|
30
|
-
# rubocop
|
31
|
-
#
|
32
|
-
|
33
|
-
task :rubocop do
|
34
|
-
system('bundle exec rubocop -A .', exception: true)
|
35
|
-
end
|
36
|
-
|
37
|
-
#
|
38
|
-
# gem
|
39
|
-
#
|
40
|
-
|
41
|
-
task :build do
|
42
|
-
system 'gem build --quiet sinew.gemspec', exception: true
|
43
|
-
end
|
44
|
-
|
45
|
-
task install: :build do
|
46
|
-
system "gem install --quiet sinew-#{spec.version}.gem", exception: true
|
47
|
-
end
|
48
|
-
|
49
|
-
task release: %i[rubocop test build] do
|
50
|
-
system "git tag -a #{spec.version} -m 'Tagging #{spec.version}'", exception: true
|
51
|
-
system 'git push --tags', exception: true
|
52
|
-
system "gem push sinew-#{spec.version}.gem", exception: true
|
53
|
-
end
|
5
|
+
Minitest::TestTask.create
|
data/bin/sinew
CHANGED
@@ -1,53 +1,25 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
$LOAD_PATH.unshift("
|
3
|
+
$LOAD_PATH.unshift(File.join(__dir__, "../lib"))
|
4
4
|
|
5
|
-
|
6
|
-
require 'slop'
|
5
|
+
BIN = File.basename($PROGRAM_NAME)
|
7
6
|
|
8
7
|
#
|
9
|
-
#
|
8
|
+
# Load the bare minimum and parse args with slop. For speed.
|
10
9
|
#
|
11
10
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
o.bool '--force-errors', "don't read errors from cache (but still write)"
|
20
|
-
o.string '--proxy', 'use host[:port] as HTTP proxy'
|
21
|
-
o.bool '--version', 'show version and exit'
|
22
|
-
o.on('--help', 'show this help') do
|
23
|
-
puts o
|
24
|
-
exit
|
25
|
-
end
|
11
|
+
require "sinew/args"
|
12
|
+
begin
|
13
|
+
slop = Sinew::Args.slop(ARGV)
|
14
|
+
rescue Slop::Error => e
|
15
|
+
warn "#{BIN}: #{e}" if e.message != ""
|
16
|
+
warn("#{BIN}: try '#{BIN} --help' for more information")
|
17
|
+
exit 1
|
26
18
|
end
|
27
19
|
|
28
|
-
if options[:version]
|
29
|
-
puts Sinew::VERSION
|
30
|
-
exit
|
31
|
-
end
|
32
|
-
|
33
|
-
#
|
34
|
-
# recipe
|
35
|
-
#
|
36
|
-
|
37
|
-
recipe = options.arguments.first
|
38
|
-
if !recipe
|
39
|
-
Scripto.fatal('need a .sinew file to run against')
|
40
|
-
end
|
41
|
-
if !File.exist?(recipe)
|
42
|
-
Scripto.fatal("#{recipe} not found")
|
43
|
-
end
|
44
|
-
if options.arguments.length > 1
|
45
|
-
Scripto.fatal('can only run on one .sinew file')
|
46
|
-
end
|
47
|
-
options = options.to_h.merge(recipe: recipe)
|
48
|
-
|
49
20
|
#
|
50
|
-
#
|
21
|
+
# now load everything and run
|
51
22
|
#
|
52
23
|
|
53
|
-
|
24
|
+
require "sinew"
|
25
|
+
Sinew::Main.new(slop).run
|