sinew 2.0.1 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +26 -0
- data/.rubocop.yml +9 -6
- data/.vscode/settings.json +0 -10
- data/Gemfile +9 -0
- data/LICENSE +1 -1
- data/README.md +77 -58
- data/Rakefile +33 -18
- data/bin/sinew +8 -4
- data/lib/sinew.rb +0 -1
- data/lib/sinew/connection.rb +52 -0
- data/lib/sinew/connection/log_formatter.rb +22 -0
- data/lib/sinew/connection/rate_limit.rb +29 -0
- data/lib/sinew/core_ext.rb +1 -1
- data/lib/sinew/dsl.rb +27 -10
- data/lib/sinew/main.rb +7 -54
- data/lib/sinew/output.rb +26 -19
- data/lib/sinew/request.rb +28 -49
- data/lib/sinew/response.rb +25 -55
- data/lib/sinew/runtime_options.rb +4 -2
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +2 -2
- data/sinew.gemspec +16 -17
- metadata +41 -81
- data/.travis.yml +0 -4
- data/lib/sinew/cache.rb +0 -79
- data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
- data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
- data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
- data/test/legacy/eu.httpbin.org/status,500 +0 -1
- data/test/legacy/legacy.sinew +0 -2
- data/test/test.html +0 -45
- data/test/test_cache.rb +0 -69
- data/test/test_helper.rb +0 -113
- data/test/test_legacy.rb +0 -21
- data/test/test_main.rb +0 -46
- data/test/test_nokogiri_ext.rb +0 -18
- data/test/test_output.rb +0 -73
- data/test/test_requests.rb +0 -135
- data/test/test_utf8.rb +0 -39
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: df55f2168ff4242fceb31d083b8d16f1046139fa7acb8a9c4fc3f06f7884e113
|
4
|
+
data.tar.gz: 520967eba4ea2d8446690736f2c28d34642b452c0f4e5003dcb89ce373c116e5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7443bccc5fc4e1bd112ce50b3d17445f0c21f5b351a6b5be586aadd63f36396312370ec6115d8116701165b3af19fcb852f85d18d0fbe7b4bf0d797312d3fa40
|
7
|
+
data.tar.gz: 9ca4f3c424e021100f518ca4f2231f515b38dbeb402ff4fce07c13a2440f19b0fabcc2a2920e51aa3f6228507550d3c94cb70a46fd22711ba3add90e4fc28004
|
@@ -0,0 +1,26 @@
|
|
1
|
+
name: test
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
paths-ignore:
|
6
|
+
- '**.md'
|
7
|
+
pull_request:
|
8
|
+
paths-ignore:
|
9
|
+
- '**.md'
|
10
|
+
workflow_dispatch:
|
11
|
+
|
12
|
+
jobs:
|
13
|
+
test:
|
14
|
+
strategy:
|
15
|
+
max-parallel: 3
|
16
|
+
matrix:
|
17
|
+
os: [ubuntu, macos]
|
18
|
+
ruby-version: [3.0, 2.7]
|
19
|
+
runs-on: ${{ matrix.os }}-latest
|
20
|
+
steps:
|
21
|
+
- uses: actions/checkout@v2
|
22
|
+
- uses: ruby/setup-ruby@v1
|
23
|
+
with:
|
24
|
+
ruby-version: ${{ matrix.ruby-version }}
|
25
|
+
- run: bundle install
|
26
|
+
- run: bundle exec rake test
|
data/.rubocop.yml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
AllCops:
|
2
|
-
|
3
|
-
|
2
|
+
TargetRubyVersion: 2.7
|
3
|
+
NewCops: enable
|
4
4
|
|
5
5
|
# amd: customizations
|
6
6
|
Layout/SpaceInsideArrayLiteralBrackets:
|
@@ -22,18 +22,19 @@ Style/TrailingCommaInHashLiteral:
|
|
22
22
|
|
23
23
|
# amd: these seem extreme
|
24
24
|
Lint/AssignmentInCondition: { Enabled: false } # I do this all the time
|
25
|
-
Lint/
|
25
|
+
Lint/SuppressedException: { Enabled: false } # blank rescues are useful
|
26
26
|
Naming/BinaryOperatorParameterName: { Enabled: false } # silly
|
27
27
|
Naming/HeredocDelimiterNaming: { Enabled: false } # silly
|
28
|
-
Naming/
|
29
|
-
|
30
|
-
|
28
|
+
Naming/MethodParameterName: { Enabled: false } # silly
|
29
|
+
Style/AccessorGrouping: { Enabled: false } # silly
|
30
|
+
Style/AsciiComments: { Enabled: false } # silly
|
31
31
|
Style/ClassAndModuleChildren: { Enabled: false } # silly
|
32
32
|
Style/Documentation: { Enabled: false } # we don't need this
|
33
33
|
Style/DoubleNegation: { Enabled: false } # silly
|
34
34
|
Style/FormatStringToken: { Enabled: false } # we like printf here
|
35
35
|
Style/FrozenStringLiteralComment: { Enabled: false } # seems excessive
|
36
36
|
Style/GuardClause: { Enabled: false } # confusing
|
37
|
+
Style/HashTransformValues: { Enabled: false } # breaks code by trying to apply to an array
|
37
38
|
Style/IfUnlessModifier: { Enabled: false } # personally I hate unless
|
38
39
|
Style/NegatedIf: { Enabled: false } # these are fine
|
39
40
|
Style/Next: { Enabled: false } # these are fine
|
@@ -41,7 +42,9 @@ Style/NumericPredicate: { Enabled: false } # silly
|
|
41
42
|
Style/ParallelAssignment: { Enabled: false } # these are fine
|
42
43
|
Style/PerlBackrefs: { Enabled: false } # these are fine
|
43
44
|
Style/RaiseArgs: { Enabled: false } # silly
|
45
|
+
Style/RedundantAssignment: { Enabled: false } # these are usually on purpose
|
44
46
|
Style/RegexpLiteral: { Enabled: false } # these are fine
|
47
|
+
Style/SoleNestedConditional: { Enabled: false } # these are fine
|
45
48
|
Style/StderrPuts: { Enabled: false } # this is awful
|
46
49
|
|
47
50
|
# amd: these Metric rules are annoying, disable
|
data/.vscode/settings.json
CHANGED
@@ -1,15 +1,5 @@
|
|
1
1
|
{
|
2
|
-
"editor.formatOnSave": true,
|
3
|
-
"editor.formatOnSaveTimeout": 1500,
|
4
|
-
"editor.tabSize": 2,
|
5
|
-
"editor.wordSeparators": "`~#$%^&*()-=+[{]}\\|;:'\",.<>/",
|
6
2
|
"files.associations": {
|
7
3
|
"*.sinew": "ruby"
|
8
|
-
},
|
9
|
-
"files.insertFinalNewline": true,
|
10
|
-
"files.trimTrailingWhitespace": true,
|
11
|
-
"ruby.format": "rubocop",
|
12
|
-
"ruby.lint": {
|
13
|
-
"rubocop": true
|
14
4
|
}
|
15
5
|
}
|
data/Gemfile
CHANGED
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -1,11 +1,13 @@
|
|
1
|
+
[![Build Status](https://github.com/gurgeous/sinew/workflows/test/badge.svg?branch=master)](https://github.com/gurgeous/sinew/actions)
|
2
|
+
|
1
3
|
## Welcome to Sinew
|
2
4
|
|
3
5
|
Sinew collects structured data from web sites (screen scraping). It provides a Ruby DSL built for crawling, a robust caching system, and integration with [Nokogiri](http://nokogiri.org). Though small, this project is the culmination of years of effort based on crawling systems built at several different companies.
|
4
6
|
|
5
7
|
Sinew is distributed as a ruby gem:
|
6
8
|
|
7
|
-
```
|
8
|
-
gem install sinew
|
9
|
+
```sh
|
10
|
+
$ gem install sinew
|
9
11
|
```
|
10
12
|
|
11
13
|
or in your Gemfile:
|
@@ -16,39 +18,32 @@ gem 'sinew'
|
|
16
18
|
|
17
19
|
## Table of Contents
|
18
20
|
|
19
|
-
<!---
|
20
|
-
markdown-toc --no-firsth1 --maxdepth 1 readme.md
|
21
|
-
-->
|
21
|
+
<!--- markdown-toc --no-firsth1 --maxdepth 1 readme.md -->
|
22
22
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
23
|
+
- [Sinew 3](#sinew-3-may-2021)
|
24
|
+
- [Quick Example](#quick-example)
|
25
|
+
- [How it Works](#how-it-works)
|
26
|
+
- [DSL Reference](#dsl-reference)
|
27
|
+
- [Hints](#hints)
|
28
|
+
- [Limitations](#limitations)
|
29
|
+
- [Changelog](#changelog)
|
30
|
+
- [License](#license)
|
30
31
|
|
31
|
-
## Sinew
|
32
|
+
## Sinew 3 (May 2021)
|
32
33
|
|
33
|
-
I am pleased to announce the release of Sinew
|
34
|
-
|
35
|
-
* Remove dependencies on active_support, curl and tidy. We use HTTParty now.
|
36
|
-
* Much easier to customize requests in `.sinew` files. For example, setting User-Agent or Bearer tokens.
|
37
|
-
* More operations like `post_json` or the generic `http`. These methods are thing wrappers around HTTParty.
|
38
|
-
* New end-of-run report.
|
39
|
-
* Tests, rubocop, vscode settings, travis, etc.
|
34
|
+
I am pleased to announce the release of Sinew 3.0. Sinew has been streamlined and updated to use the [Faraday](https://lostisland.github.io/faraday/) HTTP client with [httpdisk](https://github.com/gurgeous/httpdisk/) middleware for caching.
|
40
35
|
|
41
36
|
**Breaking change**
|
42
37
|
|
43
|
-
Sinew uses a new format for cached responses. Old Sinew
|
38
|
+
Sinew 3 uses a new format for cached responses. Old Sinew 2 cache directories should be removed before running Sinew again.
|
44
39
|
|
45
40
|
## Quick Example
|
46
41
|
|
47
|
-
Here's an example for collecting the links from
|
42
|
+
Here's an example for collecting the links from httpbingo.org:
|
48
43
|
|
49
44
|
```ruby
|
50
45
|
# get the url
|
51
|
-
get "http://
|
46
|
+
get "http://httpbingo.org"
|
52
47
|
|
53
48
|
# use nokogiri to collect links
|
54
49
|
noko.css("ul li a").each do |a|
|
@@ -113,9 +108,9 @@ Sinew creates a CSV file with the same name as the recipe, and `csv_emit(hash)`
|
|
113
108
|
|
114
109
|
#### Caching
|
115
110
|
|
116
|
-
|
111
|
+
Sinew uses [httpdisk](https://github.com/gurgeous/httpdisk/) to aggressively cache all HTTP responses to disk in `~/.sinew`. Error responses are cached as well. Each URL will be hit exactly once, and requests are rate limited to one per second. Sinew tries to be polite.
|
117
112
|
|
118
|
-
|
113
|
+
Sinew never deletes files from the cache - that's up to you!
|
119
114
|
|
120
115
|
Because all requests are cached, you can run Sinew repeatedly with confidence. Run it over and over again while you build up your recipe.
|
121
116
|
|
@@ -123,65 +118,89 @@ Because all requests are cached, you can run Sinew repeatedly with confidence. R
|
|
123
118
|
|
124
119
|
#### Making requests
|
125
120
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
121
|
+
- `get(url, query = {})` - fetch a url with HTTP GET. URL parameters can be added using `query`.
|
122
|
+
- `post(url, form = {})` - fetch a url with HTTP POST, using `form` as the URL encoded POST body.
|
123
|
+
- `post_json(url, json = {})` - fetch a url with HTTP POST, using `json` as the POST body.
|
124
|
+
- `http(method, url, options = {})` - use this for more complex requests
|
130
125
|
|
131
126
|
#### Parsing the response
|
132
127
|
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
128
|
+
These variables are set after each HTTP request.
|
129
|
+
|
130
|
+
- `raw` - the raw response from the last request
|
131
|
+
- `html` - like `raw`, but with a handful of HTML-specific whitespace cleanups
|
132
|
+
- `noko` - parse the response as HTML and return a [Nokogiri](http://nokogiri.org) document
|
133
|
+
- `xml` - parse the response as XML and return a [Nokogiri](http://nokogiri.org) document
|
134
|
+
- `json` - parse the response as JSON, with symbolized keys
|
135
|
+
- `url` - the url of the last request. If the request goes through a redirect, `url` will reflect the final url.
|
136
|
+
- `uri` - the URI of the last request. This is useful for resolving relative URLs.
|
139
137
|
|
140
138
|
#### Writing CSV
|
141
139
|
|
142
|
-
|
143
|
-
|
140
|
+
- `csv_header(keys)` - specify the columns for CSV output. If you don't call this, Sinew will use the keys from the first call to `csv_emit`.
|
141
|
+
- `csv_emit(hash)` - append a row to the CSV file
|
144
142
|
|
145
143
|
## Hints
|
146
144
|
|
147
145
|
Writing Sinew recipes is fun and easy. The builtin caching means you can iterate quickly, since you won't have to re-fetch the data. Here are some hints for writing idiomatic recipes:
|
148
146
|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
147
|
+
- Sinew doesn't (yet) check robots.txt - please check it manually.
|
148
|
+
- Prefer Nokogiri over regular expressions wherever possible. Learn [CSS selectors](http://www.w3schools.com/cssref/css_selectors.asp).
|
149
|
+
- In Chrome, `$` in the console is your friend.
|
150
|
+
- Fall back to regular expressions if you're desperate. Depending on the site, use either `raw` or `html`. `html` is probably your best bet. `raw` is good for crawling JavaScript, but it's fragile if the site changes.
|
151
|
+
- Learn to love `String#[regexp]`, which is an obscure operator but incredibly handy for Sinew.
|
152
|
+
- Laziness is useful. Keep your CSS selectors and regular expressions simple, so maybe they'll work again the next time you need to crawl a site.
|
153
|
+
- Don't be afraid to mix CSS selectors, regular expressions, and Ruby:
|
156
154
|
|
157
155
|
```ruby
|
158
156
|
noko.css("table")[4].css("td").select { |i| i[:width].to_i > 80 }.map(&:text)
|
159
157
|
```
|
160
158
|
|
161
|
-
|
162
|
-
|
163
|
-
|
159
|
+
- Debug your recipes using plain old `puts`, or better yet use `ap` from [amazing_print](https://github.com/amazing-print/amazing_print).
|
160
|
+
- Run `sinew -v` to get a report on every `csv_emit`. Very handy.
|
161
|
+
- Add the CSV files to your git repo. That way you can version them and get diffs!
|
164
162
|
|
165
163
|
## Limitations
|
166
164
|
|
167
|
-
|
168
|
-
|
165
|
+
- Caching is based on URL, so use caution with cookies and other forms of authentication
|
166
|
+
- Almost no support for international (non-English) characters
|
169
167
|
|
170
168
|
## Changelog
|
171
169
|
|
172
|
-
####
|
170
|
+
#### 3.0.0 (May 2021)
|
171
|
+
|
172
|
+
- Major rewrite of network and caching layer. See above.
|
173
|
+
- Use Faraday HTTP client with httpdisk middleware for caching.
|
174
|
+
- Supports multiple proxies (`--proxy host1,host2,...`)
|
175
|
+
|
176
|
+
#### 2.0.4 (May 2018)
|
173
177
|
|
174
|
-
|
178
|
+
- Handle and cache more errors (too many redirects, connection failures, etc.)
|
179
|
+
- Support for adding uri.scheme in generate_cache_key
|
180
|
+
- Added status `code`, a peer to `uri`, `raw`, etc.
|
175
181
|
|
176
|
-
####
|
182
|
+
#### 2.0.3 (May 2018)
|
183
|
+
|
184
|
+
- `&amp;` now normalizes to `&` (not `and`)
|
185
|
+
|
186
|
+
#### 2.0.2 (May 2018)
|
187
|
+
|
188
|
+
- Support for `--limit`, `--proxy` and the `xml` variable
|
189
|
+
- Dedup - warn and ignore if row[:url] has already been emitted
|
190
|
+
- Auto gunzip if contents are compressed
|
191
|
+
|
192
|
+
#### 2.0.1 (May 2018)
|
193
|
+
|
194
|
+
- Support for legacy cached `head` files from Sinew 1
|
195
|
+
|
196
|
+
#### 2.0.0 (May 2018)
|
177
197
|
|
178
|
-
|
198
|
+
- Complete rewrite. See above.
|
179
199
|
|
180
|
-
#### 1.0.
|
200
|
+
#### 1.0.3 (June 2012)
|
181
201
|
|
182
|
-
|
202
|
+
...
|
183
203
|
|
184
|
-
|
204
|
+
## License
|
185
205
|
|
186
|
-
|
187
|
-
* Added first batch of unit tests
|
206
|
+
This extension is [licensed under the MIT License](LICENSE).
|
data/Rakefile
CHANGED
@@ -1,38 +1,53 @@
|
|
1
|
-
require 'bundler'
|
2
1
|
require 'bundler/setup'
|
3
2
|
|
4
|
-
require 'rake'
|
5
3
|
require 'rake/testtask'
|
6
4
|
require 'sinew/version'
|
7
5
|
|
6
|
+
# load the spec, we use it below
|
7
|
+
spec = Gem::Specification.load('sinew.gemspec')
|
8
|
+
|
8
9
|
#
|
9
|
-
#
|
10
|
+
# testing
|
11
|
+
# don't forget about TESTOPTS="--verbose" rake
|
12
|
+
# also: rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
|
10
13
|
#
|
11
14
|
|
12
|
-
|
13
|
-
task :
|
14
|
-
|
15
|
+
# test (default)
|
16
|
+
task default: :test
|
17
|
+
|
18
|
+
Rake::TestTask.new do
|
19
|
+
_1.libs << 'test'
|
20
|
+
_1.warning = false # sterile has a few issues here
|
15
21
|
end
|
16
22
|
|
17
|
-
|
18
|
-
|
23
|
+
# Watch rb files, run tests whenever something changes
|
24
|
+
task :watch do
|
25
|
+
# https://superuser.com/a/665208 / https://unix.stackexchange.com/a/42288
|
26
|
+
system("while true; do find . -name '*.rb' | entr -c -d rake; test $? -gt 128 && break; done")
|
19
27
|
end
|
20
28
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
29
|
+
#
|
30
|
+
# rubocop
|
31
|
+
#
|
32
|
+
|
33
|
+
task :rubocop do
|
34
|
+
system('bundle exec rubocop -A .', exception: true)
|
25
35
|
end
|
26
36
|
|
27
37
|
#
|
28
|
-
#
|
38
|
+
# gem
|
29
39
|
#
|
30
40
|
|
31
|
-
|
32
|
-
|
41
|
+
task :build do
|
42
|
+
system 'gem build --quiet sinew.gemspec', exception: true
|
33
43
|
end
|
34
44
|
|
35
|
-
task
|
45
|
+
task install: :build do
|
46
|
+
system "gem install --quiet sinew-#{spec.version}.gem", exception: true
|
47
|
+
end
|
36
48
|
|
37
|
-
|
38
|
-
|
49
|
+
task release: %i[rubocop test build] do
|
50
|
+
system "git tag -a #{spec.version} -m 'Tagging #{spec.version}'", exception: true
|
51
|
+
system 'git push --tags', exception: true
|
52
|
+
system "gem push sinew-#{spec.version}.gem", exception: true
|
53
|
+
end
|
data/bin/sinew
CHANGED
@@ -11,11 +11,15 @@ require 'slop'
|
|
11
11
|
|
12
12
|
options = Slop.parse do |o|
|
13
13
|
o.banner = 'Usage: sinew [options] <gub.sinew>'
|
14
|
-
o.bool '-v', '--verbose', 'dump
|
15
|
-
o.bool '--version', 'show version'
|
14
|
+
o.bool '-v', '--verbose', 'dump emitted rows while running'
|
16
15
|
o.bool '-q', '--quiet', 'suppress some output'
|
17
|
-
o.
|
18
|
-
o.
|
16
|
+
o.integer '-l', '--limit', 'quit after emitting this many rows'
|
17
|
+
o.string '-c', '--cache', 'set custom cache directory', default: "#{ENV['HOME']}/.sinew"
|
18
|
+
o.bool '--force', "don't read anything from cache (but still write)"
|
19
|
+
o.bool '--force-errors', "don't read errors from cache (but still write)"
|
20
|
+
o.string '--proxy', 'use host[:port] as HTTP proxy'
|
21
|
+
o.bool '--version', 'show version and exit'
|
22
|
+
o.on('--help', 'show this help') do
|
19
23
|
puts o
|
20
24
|
exit
|
21
25
|
end
|
data/lib/sinew.rb
CHANGED
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'faraday'
|
2
|
+
require 'faraday-encoding'
|
3
|
+
require 'faraday/logging/formatter'
|
4
|
+
require 'httpdisk'
|
5
|
+
require 'sinew/connection/log_formatter'
|
6
|
+
require 'sinew/connection/rate_limit'
|
7
|
+
|
8
|
+
module Sinew
|
9
|
+
module Connection
|
10
|
+
def self.create(options:, runtime_options:)
|
11
|
+
connection_options = {}
|
12
|
+
connection_options[:ssl] = { verify: false } if runtime_options.insecure
|
13
|
+
|
14
|
+
Faraday.new(nil, connection_options) do
|
15
|
+
_1.use RateLimit, rate_limit: runtime_options.rate_limit
|
16
|
+
|
17
|
+
# auto-encode form bodies
|
18
|
+
_1.request :url_encoded
|
19
|
+
|
20
|
+
# Before httpdisk so each redirect segment is cached
|
21
|
+
# Keep track of redirect status for logger
|
22
|
+
_1.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }
|
23
|
+
|
24
|
+
# set Ruby string encoding based on Content-Type (should be above httpdisk)
|
25
|
+
_1.response :encoding
|
26
|
+
|
27
|
+
# disk caching
|
28
|
+
httpdisk_options = {
|
29
|
+
dir: options[:cache],
|
30
|
+
force: options[:force],
|
31
|
+
force_errors: options[:force_errors],
|
32
|
+
}.merge(runtime_options.httpdisk_options)
|
33
|
+
|
34
|
+
_1.use :httpdisk, httpdisk_options
|
35
|
+
|
36
|
+
# After httpdisk so that only non-cached requests are logged.
|
37
|
+
# Before retry so that we don't log each retry attempt.
|
38
|
+
_1.response :logger, nil, formatter: LogFormatter if !options[:quiet]
|
39
|
+
|
40
|
+
# After httpdisk so transient failures are not cached
|
41
|
+
retry_options = {
|
42
|
+
interval: runtime_options.rate_limit,
|
43
|
+
max: runtime_options.retries,
|
44
|
+
methods: %w[delete get head options patch post put trace],
|
45
|
+
retry_statuses: (500..600).to_a,
|
46
|
+
retry_if: ->(_env, _err) { true },
|
47
|
+
}
|
48
|
+
_1.request :retry, retry_options
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|