sinew 2.0.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fde4bbaa95fce45f3a7ae7aeacab1672615ea1ace852845b0395ce9cce32f861
4
- data.tar.gz: 5743800570722443f704c5fc7bc421346cc4f2fb116b8fe9f615bf84fb95f826
3
+ metadata.gz: df55f2168ff4242fceb31d083b8d16f1046139fa7acb8a9c4fc3f06f7884e113
4
+ data.tar.gz: 520967eba4ea2d8446690736f2c28d34642b452c0f4e5003dcb89ce373c116e5
5
5
  SHA512:
6
- metadata.gz: 94009061e7f4e36cc23528be3866c6a372df51a83e096144cafbd923259439e6d44a7d656fbdcfe09c2e059b48deb553caca3ec5d332b33845afd1e91550371a
7
- data.tar.gz: 5a8baf7fbdba371065c796c9fdce4312039558b27b4a16676b3df16d5138916ce84db0677dce6ede1831be8040df9112a0491e421813af5e5fd0b0b747d49239
6
+ metadata.gz: 7443bccc5fc4e1bd112ce50b3d17445f0c21f5b351a6b5be586aadd63f36396312370ec6115d8116701165b3af19fcb852f85d18d0fbe7b4bf0d797312d3fa40
7
+ data.tar.gz: 9ca4f3c424e021100f518ca4f2231f515b38dbeb402ff4fce07c13a2440f19b0fabcc2a2920e51aa3f6228507550d3c94cb70a46fd22711ba3add90e4fc28004
@@ -0,0 +1,26 @@
1
+ name: test
2
+
3
+ on:
4
+ push:
5
+ paths-ignore:
6
+ - '**.md'
7
+ pull_request:
8
+ paths-ignore:
9
+ - '**.md'
10
+ workflow_dispatch:
11
+
12
+ jobs:
13
+ test:
14
+ strategy:
15
+ max-parallel: 3
16
+ matrix:
17
+ os: [ubuntu, macos]
18
+ ruby-version: [3.0, 2.7]
19
+ runs-on: ${{ matrix.os }}-latest
20
+ steps:
21
+ - uses: actions/checkout@v2
22
+ - uses: ruby/setup-ruby@v1
23
+ with:
24
+ ruby-version: ${{ matrix.ruby-version }}
25
+ - run: bundle install
26
+ - run: bundle exec rake test
data/.rubocop.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  AllCops:
2
- Exclude:
3
- TargetRubyVersion: 2.3
2
+ TargetRubyVersion: 2.7
3
+ NewCops: enable
4
4
 
5
5
  # amd: customizations
6
6
  Layout/SpaceInsideArrayLiteralBrackets:
@@ -22,18 +22,19 @@ Style/TrailingCommaInHashLiteral:
22
22
 
23
23
  # amd: these seem extreme
24
24
  Lint/AssignmentInCondition: { Enabled: false } # I do this all the time
25
- Lint/HandleExceptions: { Enabled: false } # blank rescues are useful
25
+ Lint/SuppressedException: { Enabled: false } # blank rescues are useful
26
26
  Naming/BinaryOperatorParameterName: { Enabled: false } # silly
27
27
  Naming/HeredocDelimiterNaming: { Enabled: false } # silly
28
- Naming/UncommunicativeMethodParamName: { Enabled: false } # silly
29
- Performance/RegexpMatch: { Enabled: false } # =~ is fine
30
- Performance/TimesMap: { Enabled: false } # silly
28
+ Naming/MethodParameterName: { Enabled: false } # silly
29
+ Style/AccessorGrouping: { Enabled: false } # silly
30
+ Style/AsciiComments: { Enabled: false } # silly
31
31
  Style/ClassAndModuleChildren: { Enabled: false } # silly
32
32
  Style/Documentation: { Enabled: false } # we don't need this
33
33
  Style/DoubleNegation: { Enabled: false } # silly
34
34
  Style/FormatStringToken: { Enabled: false } # we like printf here
35
35
  Style/FrozenStringLiteralComment: { Enabled: false } # seems excessive
36
36
  Style/GuardClause: { Enabled: false } # confusing
37
+ Style/HashTransformValues: { Enabled: false } # breaks code by trying to apply to an array
37
38
  Style/IfUnlessModifier: { Enabled: false } # personally I hate unless
38
39
  Style/NegatedIf: { Enabled: false } # these are fine
39
40
  Style/Next: { Enabled: false } # these are fine
@@ -41,7 +42,9 @@ Style/NumericPredicate: { Enabled: false } # silly
41
42
  Style/ParallelAssignment: { Enabled: false } # these are fine
42
43
  Style/PerlBackrefs: { Enabled: false } # these are fine
43
44
  Style/RaiseArgs: { Enabled: false } # silly
45
+ Style/RedundantAssignment: { Enabled: false } # these are usually on purpose
44
46
  Style/RegexpLiteral: { Enabled: false } # these are fine
47
+ Style/SoleNestedConditional: { Enabled: false } # these are fine
45
48
  Style/StderrPuts: { Enabled: false } # this is awful
46
49
 
47
50
  # amd: these Metric rules are annoying, disable
@@ -1,15 +1,5 @@
1
1
  {
2
- "editor.formatOnSave": true,
3
- "editor.formatOnSaveTimeout": 1500,
4
- "editor.tabSize": 2,
5
- "editor.wordSeparators": "`~#$%^&*()-=+[{]}\\|;:'\",.<>/",
6
2
  "files.associations": {
7
3
  "*.sinew": "ruby"
8
- },
9
- "files.insertFinalNewline": true,
10
- "files.trimTrailingWhitespace": true,
11
- "ruby.format": "rubocop",
12
- "ruby.lint": {
13
- "rubocop": true
14
4
  }
15
5
  }
data/Gemfile CHANGED
@@ -1,2 +1,11 @@
1
1
  source 'http://rubygems.org'
2
+
3
+ group :development do
4
+ gem 'minitest'
5
+ gem 'mocha'
6
+ gem 'rake'
7
+ gem 'rubocop', '~> 0.91.0', require: false
8
+ gem 'webmock'
9
+ end
10
+
2
11
  gemspec
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2012 Adam Doppelt
1
+ Copyright (c) 2012-2018 Adam Doppelt
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -1,11 +1,13 @@
1
+ [![Build Status](https://github.com/gurgeous/sinew/workflows/test/badge.svg?branch=master)](https://github.com/gurgeous/sinew/actions)
2
+
1
3
  ## Welcome to Sinew
2
4
 
3
5
  Sinew collects structured data from web sites (screen scraping). It provides a Ruby DSL built for crawling, a robust caching system, and integration with [Nokogiri](http://nokogiri.org). Though small, this project is the culmination of years of effort based on crawling systems built at several different companies.
4
6
 
5
7
  Sinew is distributed as a ruby gem:
6
8
 
7
- ```ruby
8
- gem install sinew
9
+ ```sh
10
+ $ gem install sinew
9
11
  ```
10
12
 
11
13
  or in your Gemfile:
@@ -16,39 +18,32 @@ gem 'sinew'
16
18
 
17
19
  ## Table of Contents
18
20
 
19
- <!---
20
- markdown-toc --no-firsth1 --maxdepth 1 readme.md
21
- -->
21
+ <!--- markdown-toc --no-firsth1 --maxdepth 1 readme.md -->
22
22
 
23
- * [Sinew 2 (May 2018)](#sinew-2-may-2018)
24
- * [Quick Example](#quick-example)
25
- * [How it Works](#how-it-works)
26
- * [DSL Reference](#dsl-reference)
27
- * [Hints](#hints)
28
- * [Limitations](#limitations)
29
- * [Changelog](#changelog)
23
+ - [Sinew 3](#sinew-3-may-2021)
24
+ - [Quick Example](#quick-example)
25
+ - [How it Works](#how-it-works)
26
+ - [DSL Reference](#dsl-reference)
27
+ - [Hints](#hints)
28
+ - [Limitations](#limitations)
29
+ - [Changelog](#changelog)
30
+ - [License](#license)
30
31
 
31
- ## Sinew 2 (May 2018)
32
+ ## Sinew 3 (May 2021)
32
33
 
33
- I am pleased to announce the release of Sinew 2.0, a complete rewrite of Sinew for the modern era. Enhancements include:
34
-
35
- * Remove dependencies on active_support, curl and tidy. We use HTTParty now.
36
- * Much easier to customize requests in `.sinew` files. For example, setting User-Agent or Bearer tokens.
37
- * More operations like `post_json` or the generic `http`. These methods are thing wrappers around HTTParty.
38
- * New end-of-run report.
39
- * Tests, rubocop, vscode settings, travis, etc.
34
+ I am pleased to announce the release of Sinew 3.0. Sinew has been streamlined and updated to use the [Faraday](https://lostisland.github.io/faraday/) HTTP client with [httpdisk](https://github.com/gurgeous/httpdisk/) middleware for caching.
40
35
 
41
36
  **Breaking change**
42
37
 
43
- Sinew uses a new format for cached responses. Old Sinew 1 cache directories must be removed before running Sinew again. Sinew 2 might choke on Sinew 1 cache directores when reading `head/`. This is not tested or supported.
38
+ Sinew 3 uses a new format for cached responses. Old Sinew 2 cache directories should be removed before running Sinew again.
44
39
 
45
40
  ## Quick Example
46
41
 
47
- Here's an example for collecting the links from httpbin.org:
42
+ Here's an example for collecting the links from httpbingo.org:
48
43
 
49
44
  ```ruby
50
45
  # get the url
51
- get "http://httpbin.org"
46
+ get "http://httpbingo.org"
52
47
 
53
48
  # use nokogiri to collect links
54
49
  noko.css("ul li a").each do |a|
@@ -113,9 +108,9 @@ Sinew creates a CSV file with the same name as the recipe, and `csv_emit(hash)`
113
108
 
114
109
  #### Caching
115
110
 
116
- Requests are made using HTTParty, and all responses are cached on disk in `~/.sinew`. Error responses are cached as well. Each URL will be hit exactly once, and requests are rate limited to one per second. Sinew tries to be polite.
111
+ Sinew uses [httpdisk](https://github.com/gurgeous/httpdisk/) to aggressively cache all HTTP responses to disk in `~/.sinew`. Error responses are cached as well. Each URL will be hit exactly once, and requests are rate limited to one per second. Sinew tries to be polite.
117
112
 
118
- The files in `~/.sinew` have nice names and are designed to be human readable. This helps when writing recipes. Sinew never deletes files from the cache - that's up to you!
113
+ Sinew never deletes files from the cache - that's up to you!
119
114
 
120
115
  Because all requests are cached, you can run Sinew repeatedly with confidence. Run it over and over again while you build up your recipe.
121
116
 
@@ -123,65 +118,89 @@ Because all requests are cached, you can run Sinew repeatedly with confidence. R
123
118
 
124
119
  #### Making requests
125
120
 
126
- * `get(url, query = {})` - fetch a url with HTTP GET. URL parameters can be added using `query.
127
- * `post(url, form = {})` - fetch a url with HTTP POST, using `form` as the POST body.
128
- * `post_json(url, json = {})` - fetch a url with HTTP POST, using `json` as the POST body.
129
- * `http(method, url, options = {})` - use this for more complex requests
121
+ - `get(url, query = {})` - fetch a url with HTTP GET. URL parameters can be added using `query`.
122
+ - `post(url, form = {})` - fetch a url with HTTP POST, using `form` as the URL encoded POST body.
123
+ - `post_json(url, json = {})` - fetch a url with HTTP POST, using `json` as the POST body.
124
+ - `http(method, url, options = {})` - use this for more complex requests
130
125
 
131
126
  #### Parsing the response
132
127
 
133
- * `raw` - the raw response from the last request
134
- * `html` - like `raw`, but with a handful of HTML-specific whitespace cleanups
135
- * `noko` - a [Nokogiri](http://nokogiri.org) document built from the tidied HTML
136
- * `json` - parse the response as JSON, with symbolized keys
137
- * `url` - the url of the last request. If the request goes through a redirect, `url` will reflect the final url.
138
- * `uri` - the URI of the last request. This is useful for resolving relative URLs.
128
+ These variables are set after each HTTP request.
129
+
130
+ - `raw` - the raw response from the last request
131
+ - `html` - like `raw`, but with a handful of HTML-specific whitespace cleanups
132
+ - `noko` - parse the response as HTML and return a [Nokogiri](http://nokogiri.org) document
133
+ - `xml` - parse the response as XML and return a [Nokogiri](http://nokogiri.org) document
134
+ - `json` - parse the response as JSON, with symbolized keys
135
+ - `url` - the url of the last request. If the request goes through a redirect, `url` will reflect the final url.
136
+ - `uri` - the URI of the last request. This is useful for resolving relative URLs.
139
137
 
140
138
  #### Writing CSV
141
139
 
142
- * `csv_header(keys)` - specify the columns for CSV output. If you don't call this, Sinew will use the keys from the first call to `csv_emit`.
143
- * `csv_emit(hash)` - append a row to the CSV file
140
+ - `csv_header(keys)` - specify the columns for CSV output. If you don't call this, Sinew will use the keys from the first call to `csv_emit`.
141
+ - `csv_emit(hash)` - append a row to the CSV file
144
142
 
145
143
  ## Hints
146
144
 
147
145
  Writing Sinew recipes is fun and easy. The builtin caching means you can iterate quickly, since you won't have to re-fetch the data. Here are some hints for writing idiomatic recipes:
148
146
 
149
- * Sinew doesn't (yet) check robots.txt - please check it manually.
150
- * Prefer Nokogiri over regular expressions wherever possible. Learn [CSS selectors](http://www.w3schools.com/cssref/css_selectors.asp).
151
- * In Chrome, `$` in the console is your friend.
152
- * Fallback to regular expressions if you're desperate. Depending on the site, use either `raw` or `html`. `html` is probably your best bet. `raw` is good for crawling Javascript, but it's fragile if the site changes.
153
- * Learn to love `String#[regexp]`, which is an obscure operator but incredibly handy for Sinew.
154
- * Laziness is useful. Keep your CSS selectors and regular expressions simple, so maybe they'll work again the next time you need to crawl a site.
155
- * Don't be afraid to mix CSS selectors, regular expressions, and Ruby:
147
+ - Sinew doesn't (yet) check robots.txt - please check it manually.
148
+ - Prefer Nokogiri over regular expressions wherever possible. Learn [CSS selectors](http://www.w3schools.com/cssref/css_selectors.asp).
149
+ - In Chrome, `$` in the console is your friend.
150
+ - Fallback to regular expressions if you're desperate. Depending on the site, use either `raw` or `html`. `html` is probably your best bet. `raw` is good for crawling Javascript, but it's fragile if the site changes.
151
+ - Learn to love `String#[regexp]`, which is an obscure operator but incredibly handy for Sinew.
152
+ - Laziness is useful. Keep your CSS selectors and regular expressions simple, so maybe they'll work again the next time you need to crawl a site.
153
+ - Don't be afraid to mix CSS selectors, regular expressions, and Ruby:
156
154
 
157
155
  ```ruby
158
156
  noko.css("table")[4].css("td").select { |i| i[:width].to_i > 80 }.map(&:text)
159
157
  ```
160
158
 
161
- * Debug your recipes using plain old `puts`, or better yet use `ap` from [awesome_print](https://github.com/michaeldv/awesome_print).
162
- * Run `sinew -v` to get a report on every `csv_emit`. Very handy.
163
- * Add the CSV files to your git repo. That way you can version them and get diffs!
159
+ - Debug your recipes using plain old `puts`, or better yet use `ap` from [amazing_print](https://github.com/amazing-print/amazing_print).
160
+ - Run `sinew -v` to get a report on every `csv_emit`. Very handy.
161
+ - Add the CSV files to your git repo. That way you can version them and get diffs!
164
162
 
165
163
  ## Limitations
166
164
 
167
- * Caching is based on URL, so use caution with cookies and other forms of authentication
168
- * Almost no support for international (non-english) characters
165
+ - Caching is based on URL, so use caution with cookies and other forms of authentication
166
+ - Almost no support for international (non-english) characters
169
167
 
170
168
  ## Changelog
171
169
 
172
- #### 2.0.0 (May 2018)
170
+ #### 3.0.0 (May 2021)
171
+
172
+ - Major rewrite of network and caching layer. See above.
173
+ - Use Faraday HTTP client with httpdisk middleware for caching.
174
+ - Supports multiple proxies (`--proxy host1,host2,...`)
175
+
176
+ #### 2.0.4 (May 2018)
173
177
 
174
- * Complete rewrite. See above.
178
+ - Handle and cache more errors (too many redirects, connection failures, etc.)
179
+ - Support for adding uri.scheme in generate_cache_key
180
+ - Added status `code`, a peer to `uri`, `raw`, etc.
175
181
 
176
- #### 1.0.3
182
+ #### 2.0.3 (May 2018)
183
+
184
+ - &amp; now normalizes to & (not and)
185
+
186
+ #### 2.0.2 (May 2018)
187
+
188
+ - Support for `--limit`, `--proxy` and the `xml` variable
189
+ - Dedup - warn and ignore if row[:url] has already been emitted
190
+ - Auto gunzip if contents are compressed
191
+
192
+ #### 2.0.1 (May 2018)
193
+
194
+ - Support for legacy cached `head` files from Sinew 1
195
+
196
+ #### 2.0.0 (May 2018)
177
197
 
178
- * Friendlier message if curl or tidy are missing.
198
+ - Complete rewrite. See above.
179
199
 
180
- #### 1.0.2
200
+ #### 1.0.3 (June 2012)
181
201
 
182
- * Remove entity options from tidy, which didn't work on MacOS (thanks Rex!)
202
+ ...
183
203
 
184
- #### 1.0.1
204
+ ## License
185
205
 
186
- * Trying to run on 1.8 produces a fatal error. Onward!
187
- * Added first batch of unit tests
206
+ This gem is [licensed under the MIT License](LICENSE).
data/Rakefile CHANGED
@@ -1,38 +1,53 @@
1
- require 'bundler'
2
1
  require 'bundler/setup'
3
2
 
4
- require 'rake'
5
3
  require 'rake/testtask'
6
4
  require 'sinew/version'
7
5
 
6
+ # load the spec, we use it below
7
+ spec = Gem::Specification.load('sinew.gemspec')
8
+
8
9
  #
9
- # gem
10
+ # testing
11
+ # don't forget about TESTOPTS="--verbose" rake
12
+ # also: rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
10
13
  #
11
14
 
12
- task gem: :build
13
- task :build do
14
- system 'gem build --quiet sinew.gemspec'
15
+ # test (default)
16
+ task default: :test
17
+
18
+ Rake::TestTask.new do
19
+ _1.libs << 'test'
20
+ _1.warning = false # sterile has a few issues here
15
21
  end
16
22
 
17
- task install: :build do
18
- system "sudo gem install --quiet sinew-#{Sinew::VERSION}.gem"
23
+ # Watch rb files, run tests whenever something changes
24
+ task :watch do
25
+ # https://superuser.com/a/665208 / https://unix.stackexchange.com/a/42288
26
+ system("while true; do find . -name '*.rb' | entr -c -d rake; test $? -gt 128 && break; done")
19
27
  end
20
28
 
21
- task release: :build do
22
- system "git tag -a #{Sinew::VERSION} -m 'Tagging #{Sinew::VERSION}'"
23
- system 'git push --tags'
24
- system "gem push sinew-#{Sinew::VERSION}.gem"
29
+ #
30
+ # rubocop
31
+ #
32
+
33
+ task :rubocop do
34
+ system('bundle exec rubocop -A .', exception: true)
25
35
  end
26
36
 
27
37
  #
28
- # minitest
38
+ # gem
29
39
  #
30
40
 
31
- Rake::TestTask.new(:test) do |t|
32
- t.warning = false
41
+ task :build do
42
+ system 'gem build --quiet sinew.gemspec', exception: true
33
43
  end
34
44
 
35
- task default: :test
45
+ task install: :build do
46
+ system "gem install --quiet sinew-#{spec.version}.gem", exception: true
47
+ end
36
48
 
37
- # to test:
38
- # block ; rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
49
+ task release: %i[rubocop test build] do
50
+ system "git tag -a #{spec.version} -m 'Tagging #{spec.version}'", exception: true
51
+ system 'git push --tags', exception: true
52
+ system "gem push sinew-#{spec.version}.gem", exception: true
53
+ end
data/bin/sinew CHANGED
@@ -11,11 +11,15 @@ require 'slop'
11
11
 
12
12
  options = Slop.parse do |o|
13
13
  o.banner = 'Usage: sinew [options] <gub.sinew>'
14
- o.bool '-v', '--verbose', 'dump every row'
15
- o.bool '--version', 'show version'
14
+ o.bool '-v', '--verbose', 'dump emitted rows while running'
16
15
  o.bool '-q', '--quiet', 'suppress some output'
17
- o.string '--cache', 'Set the cache directory (defaults to ~/.sinew)', default: "#{ENV['HOME']}/.sinew"
18
- o.on '--help' do
16
+ o.integer '-l', '--limit', 'quit after emitting this many rows'
17
+ o.string '-c', '--cache', 'set custom cache directory', default: "#{ENV['HOME']}/.sinew"
18
+ o.bool '--force', "don't read anything from cache (but still write)"
19
+ o.bool '--force-errors', "don't read errors from cache (but still write)"
20
+ o.string '--proxy', 'use host[:port] as HTTP proxy'
21
+ o.bool '--version', 'show version and exit'
22
+ o.on('--help', 'show this help') do
19
23
  puts o
20
24
  exit
21
25
  end
data/lib/sinew.rb CHANGED
@@ -1,4 +1,3 @@
1
- require_relative 'sinew/cache'
2
1
  require_relative 'sinew/core_ext'
3
2
  require_relative 'sinew/dsl'
4
3
  require_relative 'sinew/main'
@@ -0,0 +1,52 @@
1
+ require 'faraday'
2
+ require 'faraday-encoding'
3
+ require 'faraday/logging/formatter'
4
+ require 'httpdisk'
5
+ require 'sinew/connection/log_formatter'
6
+ require 'sinew/connection/rate_limit'
7
+
8
+ module Sinew
9
+ module Connection
10
+ def self.create(options:, runtime_options:)
11
+ connection_options = {}
12
+ connection_options[:ssl] = { verify: false } if runtime_options.insecure
13
+
14
+ Faraday.new(nil, connection_options) do
15
+ _1.use RateLimit, rate_limit: runtime_options.rate_limit
16
+
17
+ # auto-encode form bodies
18
+ _1.request :url_encoded
19
+
20
+ # Before httpdisk so each redirect segment is cached
21
+ # Keep track of redirect status for logger
22
+ _1.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }
23
+
24
+ # set Ruby string encoding based on Content-Type (should be above httpdisk)
25
+ _1.response :encoding
26
+
27
+ # disk caching
28
+ httpdisk_options = {
29
+ dir: options[:cache],
30
+ force: options[:force],
31
+ force_errors: options[:force_errors],
32
+ }.merge(runtime_options.httpdisk_options)
33
+
34
+ _1.use :httpdisk, httpdisk_options
35
+
36
+ # After httpdisk so that only non-cached requests are logged.
37
+ # Before retry so that we don't log each retry attempt.
38
+ _1.response :logger, nil, formatter: LogFormatter if !options[:quiet]
39
+
40
+ # After httpdisk so transient failures are not cached
41
+ retry_options = {
42
+ interval: runtime_options.rate_limit,
43
+ max: runtime_options.retries,
44
+ methods: %w[delete get head options patch post put trace],
45
+ retry_statuses: (500..600).to_a,
46
+ retry_if: ->(_env, _err) { true },
47
+ }
48
+ _1.request :retry, retry_options
49
+ end
50
+ end
51
+ end
52
+ end