sinew 2.0.1 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +26 -0
- data/.rubocop.yml +9 -6
- data/.vscode/settings.json +0 -10
- data/Gemfile +9 -0
- data/LICENSE +1 -1
- data/README.md +77 -58
- data/Rakefile +33 -18
- data/bin/sinew +8 -4
- data/lib/sinew.rb +0 -1
- data/lib/sinew/connection.rb +52 -0
- data/lib/sinew/connection/log_formatter.rb +22 -0
- data/lib/sinew/connection/rate_limit.rb +29 -0
- data/lib/sinew/core_ext.rb +1 -1
- data/lib/sinew/dsl.rb +27 -10
- data/lib/sinew/main.rb +7 -54
- data/lib/sinew/output.rb +26 -19
- data/lib/sinew/request.rb +28 -49
- data/lib/sinew/response.rb +25 -55
- data/lib/sinew/runtime_options.rb +4 -2
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +2 -2
- data/sinew.gemspec +16 -17
- metadata +41 -81
- data/.travis.yml +0 -4
- data/lib/sinew/cache.rb +0 -79
- data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
- data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
- data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
- data/test/legacy/eu.httpbin.org/status,500 +0 -1
- data/test/legacy/legacy.sinew +0 -2
- data/test/test.html +0 -45
- data/test/test_cache.rb +0 -69
- data/test/test_helper.rb +0 -113
- data/test/test_legacy.rb +0 -21
- data/test/test_main.rb +0 -46
- data/test/test_nokogiri_ext.rb +0 -18
- data/test/test_output.rb +0 -73
- data/test/test_requests.rb +0 -135
- data/test/test_utf8.rb +0 -39
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: df55f2168ff4242fceb31d083b8d16f1046139fa7acb8a9c4fc3f06f7884e113
|
4
|
+
data.tar.gz: 520967eba4ea2d8446690736f2c28d34642b452c0f4e5003dcb89ce373c116e5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7443bccc5fc4e1bd112ce50b3d17445f0c21f5b351a6b5be586aadd63f36396312370ec6115d8116701165b3af19fcb852f85d18d0fbe7b4bf0d797312d3fa40
|
7
|
+
data.tar.gz: 9ca4f3c424e021100f518ca4f2231f515b38dbeb402ff4fce07c13a2440f19b0fabcc2a2920e51aa3f6228507550d3c94cb70a46fd22711ba3add90e4fc28004
|
@@ -0,0 +1,26 @@
|
|
1
|
+
name: test
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
paths-ignore:
|
6
|
+
- '**.md'
|
7
|
+
pull_request:
|
8
|
+
paths-ignore:
|
9
|
+
- '**.md'
|
10
|
+
workflow_dispatch:
|
11
|
+
|
12
|
+
jobs:
|
13
|
+
test:
|
14
|
+
strategy:
|
15
|
+
max-parallel: 3
|
16
|
+
matrix:
|
17
|
+
os: [ubuntu, macos]
|
18
|
+
ruby-version: [3.0, 2.7]
|
19
|
+
runs-on: ${{ matrix.os }}-latest
|
20
|
+
steps:
|
21
|
+
- uses: actions/checkout@v2
|
22
|
+
- uses: ruby/setup-ruby@v1
|
23
|
+
with:
|
24
|
+
ruby-version: ${{ matrix.ruby-version }}
|
25
|
+
- run: bundle install
|
26
|
+
- run: bundle exec rake test
|
data/.rubocop.yml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
AllCops:
|
2
|
-
|
3
|
-
|
2
|
+
TargetRubyVersion: 2.7
|
3
|
+
NewCops: enable
|
4
4
|
|
5
5
|
# amd: customizations
|
6
6
|
Layout/SpaceInsideArrayLiteralBrackets:
|
@@ -22,18 +22,19 @@ Style/TrailingCommaInHashLiteral:
|
|
22
22
|
|
23
23
|
# amd: these seem extreme
|
24
24
|
Lint/AssignmentInCondition: { Enabled: false } # I do this all the time
|
25
|
-
Lint/
|
25
|
+
Lint/SuppressedException: { Enabled: false } # blank rescues are useful
|
26
26
|
Naming/BinaryOperatorParameterName: { Enabled: false } # silly
|
27
27
|
Naming/HeredocDelimiterNaming: { Enabled: false } # silly
|
28
|
-
Naming/
|
29
|
-
|
30
|
-
|
28
|
+
Naming/MethodParameterName: { Enabled: false } # silly
|
29
|
+
Style/AccessorGrouping: { Enabled: false } # silly
|
30
|
+
Style/AsciiComments: { Enabled: false } # silly
|
31
31
|
Style/ClassAndModuleChildren: { Enabled: false } # silly
|
32
32
|
Style/Documentation: { Enabled: false } # we don't need this
|
33
33
|
Style/DoubleNegation: { Enabled: false } # silly
|
34
34
|
Style/FormatStringToken: { Enabled: false } # we like printf here
|
35
35
|
Style/FrozenStringLiteralComment: { Enabled: false } # seems excessive
|
36
36
|
Style/GuardClause: { Enabled: false } # confusing
|
37
|
+
Style/HashTransformValues: { Enabled: false } # breaks code by trying to apply to an array
|
37
38
|
Style/IfUnlessModifier: { Enabled: false } # personally I hate unless
|
38
39
|
Style/NegatedIf: { Enabled: false } # these are fine
|
39
40
|
Style/Next: { Enabled: false } # these are fine
|
@@ -41,7 +42,9 @@ Style/NumericPredicate: { Enabled: false } # silly
|
|
41
42
|
Style/ParallelAssignment: { Enabled: false } # these are fine
|
42
43
|
Style/PerlBackrefs: { Enabled: false } # these are fine
|
43
44
|
Style/RaiseArgs: { Enabled: false } # silly
|
45
|
+
Style/RedundantAssignment: { Enabled: false } # these are usually on purpose
|
44
46
|
Style/RegexpLiteral: { Enabled: false } # these are fine
|
47
|
+
Style/SoleNestedConditional: { Enabled: false } # these are fine
|
45
48
|
Style/StderrPuts: { Enabled: false } # this is awful
|
46
49
|
|
47
50
|
# amd: these Metric rules are annoying, disable
|
data/.vscode/settings.json
CHANGED
@@ -1,15 +1,5 @@
|
|
1
1
|
{
|
2
|
-
"editor.formatOnSave": true,
|
3
|
-
"editor.formatOnSaveTimeout": 1500,
|
4
|
-
"editor.tabSize": 2,
|
5
|
-
"editor.wordSeparators": "`~#$%^&*()-=+[{]}\\|;:'\",.<>/",
|
6
2
|
"files.associations": {
|
7
3
|
"*.sinew": "ruby"
|
8
|
-
},
|
9
|
-
"files.insertFinalNewline": true,
|
10
|
-
"files.trimTrailingWhitespace": true,
|
11
|
-
"ruby.format": "rubocop",
|
12
|
-
"ruby.lint": {
|
13
|
-
"rubocop": true
|
14
4
|
}
|
15
5
|
}
|
data/Gemfile
CHANGED
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,11 +1,13 @@
|
|
1
|
+
[![test](https://github.com/gurgeous/sinew/actions/workflows/test.yml/badge.svg)](https://github.com/gurgeous/sinew/actions)
|
2
|
+
|
1
3
|
## Welcome to Sinew
|
2
4
|
|
3
5
|
Sinew collects structured data from web sites (screen scraping). It provides a Ruby DSL built for crawling, a robust caching system, and integration with [Nokogiri](http://nokogiri.org). Though small, this project is the culmination of years of effort based on crawling systems built at several different companies.
|
4
6
|
|
5
7
|
Sinew is distributed as a ruby gem:
|
6
8
|
|
7
|
-
```
|
8
|
-
gem install sinew
|
9
|
+
```sh
|
10
|
+
$ gem install sinew
|
9
11
|
```
|
10
12
|
|
11
13
|
or in your Gemfile:
|
@@ -16,39 +18,32 @@ gem 'sinew'
|
|
16
18
|
|
17
19
|
## Table of Contents
|
18
20
|
|
19
|
-
<!---
|
20
|
-
markdown-toc --no-firsth1 --maxdepth 1 readme.md
|
21
|
-
-->
|
21
|
+
<!--- markdown-toc --no-firsth1 --maxdepth 1 readme.md -->
|
22
22
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
23
|
+
- [Sinew 3](#sinew-3-may-2021)
|
24
|
+
- [Quick Example](#quick-example)
|
25
|
+
- [How it Works](#how-it-works)
|
26
|
+
- [DSL Reference](#dsl-reference)
|
27
|
+
- [Hints](#hints)
|
28
|
+
- [Limitations](#limitations)
|
29
|
+
- [Changelog](#changelog)
|
30
|
+
- [License](#license)
|
30
31
|
|
31
|
-
## Sinew
|
32
|
+
## Sinew 3 (May 2021)
|
32
33
|
|
33
|
-
I am pleased to announce the release of Sinew
|
34
|
-
|
35
|
-
* Remove dependencies on active_support, curl and tidy. We use HTTParty now.
|
36
|
-
* Much easier to customize requests in `.sinew` files. For example, setting User-Agent or Bearer tokens.
|
37
|
-
* More operations like `post_json` or the generic `http`. These methods are thin wrappers around HTTParty.
|
38
|
-
* New end-of-run report.
|
39
|
-
* Tests, rubocop, vscode settings, travis, etc.
|
34
|
+
I am pleased to announce the release of Sinew 3.0. Sinew has been streamlined and updated to use the [Faraday](https://lostisland.github.io/faraday/) HTTP client with [httpdisk](https://github.com/gurgeous/httpdisk/) middleware for caching.
|
40
35
|
|
41
36
|
**Breaking change**
|
42
37
|
|
43
|
-
Sinew uses a new format for cached responses. Old Sinew
|
38
|
+
Sinew 3 uses a new format for cached responses. Old Sinew 2 cache directories should be removed before running Sinew again.
|
44
39
|
|
45
40
|
## Quick Example
|
46
41
|
|
47
|
-
Here's an example for collecting the links from
|
42
|
+
Here's an example for collecting the links from httpbingo.org:
|
48
43
|
|
49
44
|
```ruby
|
50
45
|
# get the url
|
51
|
-
get "http://
|
46
|
+
get "http://httpbingo.org"
|
52
47
|
|
53
48
|
# use nokogiri to collect links
|
54
49
|
noko.css("ul li a").each do |a|
|
@@ -113,9 +108,9 @@ Sinew creates a CSV file with the same name as the recipe, and `csv_emit(hash)`
|
|
113
108
|
|
114
109
|
#### Caching
|
115
110
|
|
116
|
-
|
111
|
+
Sinew uses [httpdisk](https://github.com/gurgeous/httpdisk/) to aggressively cache all HTTP responses to disk in `~/.sinew`. Error responses are cached as well. Each URL will be hit exactly once, and requests are rate limited to one per second. Sinew tries to be polite.
|
117
112
|
|
118
|
-
|
113
|
+
Sinew never deletes files from the cache - that's up to you!
|
119
114
|
|
120
115
|
Because all requests are cached, you can run Sinew repeatedly with confidence. Run it over and over again while you build up your recipe.
|
121
116
|
|
@@ -123,65 +118,89 @@ Because all requests are cached, you can run Sinew repeatedly with confidence. R
|
|
123
118
|
|
124
119
|
#### Making requests
|
125
120
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
121
|
+
- `get(url, query = {})` - fetch a url with HTTP GET. URL parameters can be added using `query`.
|
122
|
+
- `post(url, form = {})` - fetch a url with HTTP POST, using `form` as the URL encoded POST body.
|
123
|
+
- `post_json(url, json = {})` - fetch a url with HTTP POST, using `json` as the POST body.
|
124
|
+
- `http(method, url, options = {})` - use this for more complex requests
|
130
125
|
|
131
126
|
#### Parsing the response
|
132
127
|
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
128
|
+
These variables are set after each HTTP request.
|
129
|
+
|
130
|
+
- `raw` - the raw response from the last request
|
131
|
+
- `html` - like `raw`, but with a handful of HTML-specific whitespace cleanups
|
132
|
+
- `noko` - parse the response as HTML and return a [Nokogiri](http://nokogiri.org) document
|
133
|
+
- `xml` - parse the response as XML and return a [Nokogiri](http://nokogiri.org) document
|
134
|
+
- `json` - parse the response as JSON, with symbolized keys
|
135
|
+
- `url` - the url of the last request. If the request goes through a redirect, `url` will reflect the final url.
|
136
|
+
- `uri` - the URI of the last request. This is useful for resolving relative URLs.
|
139
137
|
|
140
138
|
#### Writing CSV
|
141
139
|
|
142
|
-
|
143
|
-
|
140
|
+
- `csv_header(keys)` - specify the columns for CSV output. If you don't call this, Sinew will use the keys from the first call to `csv_emit`.
|
141
|
+
- `csv_emit(hash)` - append a row to the CSV file
|
144
142
|
|
145
143
|
## Hints
|
146
144
|
|
147
145
|
Writing Sinew recipes is fun and easy. The builtin caching means you can iterate quickly, since you won't have to re-fetch the data. Here are some hints for writing idiomatic recipes:
|
148
146
|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
147
|
+
- Sinew doesn't (yet) check robots.txt - please check it manually.
|
148
|
+
- Prefer Nokogiri over regular expressions wherever possible. Learn [CSS selectors](http://www.w3schools.com/cssref/css_selectors.asp).
|
149
|
+
- In Chrome, `$` in the console is your friend.
|
150
|
+
- Fallback to regular expressions if you're desperate. Depending on the site, use either `raw` or `html`. `html` is probably your best bet. `raw` is good for crawling Javascript, but it's fragile if the site changes.
|
151
|
+
- Learn to love `String#[regexp]`, which is an obscure operator but incredibly handy for Sinew.
|
152
|
+
- Laziness is useful. Keep your CSS selectors and regular expressions simple, so maybe they'll work again the next time you need to crawl a site.
|
153
|
+
- Don't be afraid to mix CSS selectors, regular expressions, and Ruby:
|
156
154
|
|
157
155
|
```ruby
|
158
156
|
noko.css("table")[4].css("td").select { |i| i[:width].to_i > 80 }.map(&:text)
|
159
157
|
```
|
160
158
|
|
161
|
-
|
162
|
-
|
163
|
-
|
159
|
+
- Debug your recipes using plain old `puts`, or better yet use `ap` from [amazing_print](https://github.com/amazing-print/amazing_print).
|
160
|
+
- Run `sinew -v` to get a report on every `csv_emit`. Very handy.
|
161
|
+
- Add the CSV files to your git repo. That way you can version them and get diffs!
|
164
162
|
|
165
163
|
## Limitations
|
166
164
|
|
167
|
-
|
168
|
-
|
165
|
+
- Caching is based on URL, so use caution with cookies and other forms of authentication
|
166
|
+
- Almost no support for international (non-English) characters
|
169
167
|
|
170
168
|
## Changelog
|
171
169
|
|
172
|
-
####
|
170
|
+
#### 3.0.0 (May 2021)
|
171
|
+
|
172
|
+
- Major rewrite of network and caching layer. See above.
|
173
|
+
- Use Faraday HTTP client with httpdisk middleware for caching.
|
174
|
+
- Supports multiple proxies (`--proxy host1,host2,...`)
|
175
|
+
|
176
|
+
#### 2.0.4 (May 2018)
|
173
177
|
|
174
|
-
|
178
|
+
- Handle and cache more errors (too many redirects, connection failures, etc.)
|
179
|
+
- Support for adding uri.scheme in generate_cache_key
|
180
|
+
- Added status `code`, a peer to `uri`, `raw`, etc.
|
175
181
|
|
176
|
-
####
|
182
|
+
#### 2.0.3 (May 2018)
|
183
|
+
|
184
|
+
- `&amp;` now normalizes to `&` (not `and`)
|
185
|
+
|
186
|
+
#### 2.0.2 (May 2018)
|
187
|
+
|
188
|
+
- Support for `--limit`, `--proxy` and the `xml` variable
|
189
|
+
- Dedup - warn and ignore if row[:url] has already been emitted
|
190
|
+
- Auto gunzip if contents are compressed
|
191
|
+
|
192
|
+
#### 2.0.1 (May 2018)
|
193
|
+
|
194
|
+
- Support for legacy cached `head` files from Sinew 1
|
195
|
+
|
196
|
+
#### 2.0.0 (May 2018)
|
177
197
|
|
178
|
-
|
198
|
+
- Complete rewrite. See above.
|
179
199
|
|
180
|
-
#### 1.0.
|
200
|
+
#### 1.0.3 (June 2012)
|
181
201
|
|
182
|
-
|
202
|
+
...
|
183
203
|
|
184
|
-
|
204
|
+
## License
|
185
205
|
|
186
|
-
|
187
|
-
* Added first batch of unit tests
|
206
|
+
This extension is [licensed under the MIT License](LICENSE).
|
data/Rakefile
CHANGED
@@ -1,38 +1,53 @@
|
|
1
|
-
require 'bundler'
|
2
1
|
require 'bundler/setup'
|
3
2
|
|
4
|
-
require 'rake'
|
5
3
|
require 'rake/testtask'
|
6
4
|
require 'sinew/version'
|
7
5
|
|
6
|
+
# load the spec, we use it below
|
7
|
+
spec = Gem::Specification.load('sinew.gemspec')
|
8
|
+
|
8
9
|
#
|
9
|
-
#
|
10
|
+
# testing
|
11
|
+
# don't forget about TESTOPTS="--verbose" rake
|
12
|
+
# also: rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
|
10
13
|
#
|
11
14
|
|
12
|
-
|
13
|
-
task :
|
14
|
-
|
15
|
+
# test (default)
|
16
|
+
task default: :test
|
17
|
+
|
18
|
+
Rake::TestTask.new do
|
19
|
+
_1.libs << 'test'
|
20
|
+
_1.warning = false # sterile has a few issues here
|
15
21
|
end
|
16
22
|
|
17
|
-
|
18
|
-
|
23
|
+
# Watch rb files, run tests whenever something changes
|
24
|
+
task :watch do
|
25
|
+
# https://superuser.com/a/665208 / https://unix.stackexchange.com/a/42288
|
26
|
+
system("while true; do find . -name '*.rb' | entr -c -d rake; test $? -gt 128 && break; done")
|
19
27
|
end
|
20
28
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
29
|
+
#
|
30
|
+
# rubocop
|
31
|
+
#
|
32
|
+
|
33
|
+
task :rubocop do
|
34
|
+
system('bundle exec rubocop -A .', exception: true)
|
25
35
|
end
|
26
36
|
|
27
37
|
#
|
28
|
-
#
|
38
|
+
# gem
|
29
39
|
#
|
30
40
|
|
31
|
-
|
32
|
-
|
41
|
+
task :build do
|
42
|
+
system 'gem build --quiet sinew.gemspec', exception: true
|
33
43
|
end
|
34
44
|
|
35
|
-
task
|
45
|
+
task install: :build do
|
46
|
+
system "gem install --quiet sinew-#{spec.version}.gem", exception: true
|
47
|
+
end
|
36
48
|
|
37
|
-
|
38
|
-
|
49
|
+
task release: %i[rubocop test build] do
|
50
|
+
system "git tag -a #{spec.version} -m 'Tagging #{spec.version}'", exception: true
|
51
|
+
system 'git push --tags', exception: true
|
52
|
+
system "gem push sinew-#{spec.version}.gem", exception: true
|
53
|
+
end
|
data/bin/sinew
CHANGED
@@ -11,11 +11,15 @@ require 'slop'
|
|
11
11
|
|
12
12
|
options = Slop.parse do |o|
|
13
13
|
o.banner = 'Usage: sinew [options] <gub.sinew>'
|
14
|
-
o.bool '-v', '--verbose', 'dump
|
15
|
-
o.bool '--version', 'show version'
|
14
|
+
o.bool '-v', '--verbose', 'dump emitted rows while running'
|
16
15
|
o.bool '-q', '--quiet', 'suppress some output'
|
17
|
-
o.
|
18
|
-
o.
|
16
|
+
o.integer '-l', '--limit', 'quit after emitting this many rows'
|
17
|
+
o.string '-c', '--cache', 'set custom cache directory', default: "#{ENV['HOME']}/.sinew"
|
18
|
+
o.bool '--force', "don't read anything from cache (but still write)"
|
19
|
+
o.bool '--force-errors', "don't read errors from cache (but still write)"
|
20
|
+
o.string '--proxy', 'use host[:port] as HTTP proxy'
|
21
|
+
o.bool '--version', 'show version and exit'
|
22
|
+
o.on('--help', 'show this help') do
|
19
23
|
puts o
|
20
24
|
exit
|
21
25
|
end
|
data/lib/sinew.rb
CHANGED
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'faraday'
|
2
|
+
require 'faraday-encoding'
|
3
|
+
require 'faraday/logging/formatter'
|
4
|
+
require 'httpdisk'
|
5
|
+
require 'sinew/connection/log_formatter'
|
6
|
+
require 'sinew/connection/rate_limit'
|
7
|
+
|
8
|
+
module Sinew
|
9
|
+
module Connection
|
10
|
+
def self.create(options:, runtime_options:)
|
11
|
+
connection_options = {}
|
12
|
+
connection_options[:ssl] = { verify: false } if runtime_options.insecure
|
13
|
+
|
14
|
+
Faraday.new(nil, connection_options) do
|
15
|
+
_1.use RateLimit, rate_limit: runtime_options.rate_limit
|
16
|
+
|
17
|
+
# auto-encode form bodies
|
18
|
+
_1.request :url_encoded
|
19
|
+
|
20
|
+
# Before httpdisk so each redirect segment is cached
|
21
|
+
# Keep track of redirect status for logger
|
22
|
+
_1.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }
|
23
|
+
|
24
|
+
# set Ruby string encoding based on Content-Type (should be above httpdisk)
|
25
|
+
_1.response :encoding
|
26
|
+
|
27
|
+
# disk caching
|
28
|
+
httpdisk_options = {
|
29
|
+
dir: options[:cache],
|
30
|
+
force: options[:force],
|
31
|
+
force_errors: options[:force_errors],
|
32
|
+
}.merge(runtime_options.httpdisk_options)
|
33
|
+
|
34
|
+
_1.use :httpdisk, httpdisk_options
|
35
|
+
|
36
|
+
# After httpdisk so that only non-cached requests are logged.
|
37
|
+
# Before retry so that we don't log each retry attempt.
|
38
|
+
_1.response :logger, nil, formatter: LogFormatter if !options[:quiet]
|
39
|
+
|
40
|
+
# After httpdisk so transient failures are not cached
|
41
|
+
retry_options = {
|
42
|
+
interval: runtime_options.rate_limit,
|
43
|
+
max: runtime_options.retries,
|
44
|
+
methods: %w[delete get head options patch post put trace],
|
45
|
+
retry_statuses: (500..600).to_a,
|
46
|
+
retry_if: ->(_env, _err) { true },
|
47
|
+
}
|
48
|
+
_1.request :retry, retry_options
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|