sinew 2.0.1 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fde4bbaa95fce45f3a7ae7aeacab1672615ea1ace852845b0395ce9cce32f861
4
- data.tar.gz: 5743800570722443f704c5fc7bc421346cc4f2fb116b8fe9f615bf84fb95f826
3
+ metadata.gz: df55f2168ff4242fceb31d083b8d16f1046139fa7acb8a9c4fc3f06f7884e113
4
+ data.tar.gz: 520967eba4ea2d8446690736f2c28d34642b452c0f4e5003dcb89ce373c116e5
5
5
  SHA512:
6
- metadata.gz: 94009061e7f4e36cc23528be3866c6a372df51a83e096144cafbd923259439e6d44a7d656fbdcfe09c2e059b48deb553caca3ec5d332b33845afd1e91550371a
7
- data.tar.gz: 5a8baf7fbdba371065c796c9fdce4312039558b27b4a16676b3df16d5138916ce84db0677dce6ede1831be8040df9112a0491e421813af5e5fd0b0b747d49239
6
+ metadata.gz: 7443bccc5fc4e1bd112ce50b3d17445f0c21f5b351a6b5be586aadd63f36396312370ec6115d8116701165b3af19fcb852f85d18d0fbe7b4bf0d797312d3fa40
7
+ data.tar.gz: 9ca4f3c424e021100f518ca4f2231f515b38dbeb402ff4fce07c13a2440f19b0fabcc2a2920e51aa3f6228507550d3c94cb70a46fd22711ba3add90e4fc28004
@@ -0,0 +1,26 @@
1
+ name: test
2
+
3
+ on:
4
+ push:
5
+ paths-ignore:
6
+ - '**.md'
7
+ pull_request:
8
+ paths-ignore:
9
+ - '**.md'
10
+ workflow_dispatch:
11
+
12
+ jobs:
13
+ test:
14
+ strategy:
15
+ max-parallel: 3
16
+ matrix:
17
+ os: [ubuntu, macos]
18
+ ruby-version: [3.0, 2.7]
19
+ runs-on: ${{ matrix.os }}-latest
20
+ steps:
21
+ - uses: actions/checkout@v2
22
+ - uses: ruby/setup-ruby@v1
23
+ with:
24
+ ruby-version: ${{ matrix.ruby-version }}
25
+ - run: bundle install
26
+ - run: bundle exec rake test
data/.rubocop.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  AllCops:
2
- Exclude:
3
- TargetRubyVersion: 2.3
2
+ TargetRubyVersion: 2.7
3
+ NewCops: enable
4
4
 
5
5
  # amd: customizations
6
6
  Layout/SpaceInsideArrayLiteralBrackets:
@@ -22,18 +22,19 @@ Style/TrailingCommaInHashLiteral:
22
22
 
23
23
  # amd: these seem extreme
24
24
  Lint/AssignmentInCondition: { Enabled: false } # I do this all the time
25
- Lint/HandleExceptions: { Enabled: false } # blank rescues are useful
25
+ Lint/SuppressedException: { Enabled: false } # blank rescues are useful
26
26
  Naming/BinaryOperatorParameterName: { Enabled: false } # silly
27
27
  Naming/HeredocDelimiterNaming: { Enabled: false } # silly
28
- Naming/UncommunicativeMethodParamName: { Enabled: false } # silly
29
- Performance/RegexpMatch: { Enabled: false } # =~ is fine
30
- Performance/TimesMap: { Enabled: false } # silly
28
+ Naming/MethodParameterName: { Enabled: false } # silly
29
+ Style/AccessorGrouping: { Enabled: false } # silly
30
+ Style/AsciiComments: { Enabled: false } # silly
31
31
  Style/ClassAndModuleChildren: { Enabled: false } # silly
32
32
  Style/Documentation: { Enabled: false } # we don't need this
33
33
  Style/DoubleNegation: { Enabled: false } # silly
34
34
  Style/FormatStringToken: { Enabled: false } # we like printf here
35
35
  Style/FrozenStringLiteralComment: { Enabled: false } # seems excessive
36
36
  Style/GuardClause: { Enabled: false } # confusing
37
+ Style/HashTransformValues: { Enabled: false } # breaks code by trying to apply to an array
37
38
  Style/IfUnlessModifier: { Enabled: false } # personally I hate unless
38
39
  Style/NegatedIf: { Enabled: false } # these are fine
39
40
  Style/Next: { Enabled: false } # these are fine
@@ -41,7 +42,9 @@ Style/NumericPredicate: { Enabled: false } # silly
41
42
  Style/ParallelAssignment: { Enabled: false } # these are fine
42
43
  Style/PerlBackrefs: { Enabled: false } # these are fine
43
44
  Style/RaiseArgs: { Enabled: false } # silly
45
+ Style/RedundantAssignment: { Enabled: false } # these are usually on purpose
44
46
  Style/RegexpLiteral: { Enabled: false } # these are fine
47
+ Style/SoleNestedConditional: { Enabled: false } # these are fine
45
48
  Style/StderrPuts: { Enabled: false } # this is awful
46
49
 
47
50
  # amd: these Metric rules are annoying, disable
@@ -1,15 +1,5 @@
1
1
  {
2
- "editor.formatOnSave": true,
3
- "editor.formatOnSaveTimeout": 1500,
4
- "editor.tabSize": 2,
5
- "editor.wordSeparators": "`~#$%^&*()-=+[{]}\\|;:'\",.<>/",
6
2
  "files.associations": {
7
3
  "*.sinew": "ruby"
8
- },
9
- "files.insertFinalNewline": true,
10
- "files.trimTrailingWhitespace": true,
11
- "ruby.format": "rubocop",
12
- "ruby.lint": {
13
- "rubocop": true
14
4
  }
15
5
  }
data/Gemfile CHANGED
@@ -1,2 +1,11 @@
1
1
  source 'http://rubygems.org'
2
+
3
+ group :development do
4
+ gem 'minitest'
5
+ gem 'mocha'
6
+ gem 'rake'
7
+ gem 'rubocop', '~> 0.91.0', require: false
8
+ gem 'webmock'
9
+ end
10
+
2
11
  gemspec
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2012 Adam Doppelt
1
+ Copyright (c) 2012-2018 Adam Doppelt
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -1,11 +1,13 @@
1
+ [![Build Status](https://github.com/gurgeous/sinew/workflows/test/badge.svg?branch=master)](https://github.com/gurgeous/sinew/actions)
2
+
1
3
  ## Welcome to Sinew
2
4
 
3
5
  Sinew collects structured data from web sites (screen scraping). It provides a Ruby DSL built for crawling, a robust caching system, and integration with [Nokogiri](http://nokogiri.org). Though small, this project is the culmination of years of effort based on crawling systems built at several different companies.
4
6
 
5
7
  Sinew is distributed as a ruby gem:
6
8
 
7
- ```ruby
8
- gem install sinew
9
+ ```sh
10
+ $ gem install sinew
9
11
  ```
10
12
 
11
13
  or in your Gemfile:
@@ -16,39 +18,32 @@ gem 'sinew'
16
18
 
17
19
  ## Table of Contents
18
20
 
19
- <!---
20
- markdown-toc --no-firsth1 --maxdepth 1 readme.md
21
- -->
21
+ <!--- markdown-toc --no-firsth1 --maxdepth 1 readme.md -->
22
22
 
23
- * [Sinew 2 (May 2018)](#sinew-2-may-2018)
24
- * [Quick Example](#quick-example)
25
- * [How it Works](#how-it-works)
26
- * [DSL Reference](#dsl-reference)
27
- * [Hints](#hints)
28
- * [Limitations](#limitations)
29
- * [Changelog](#changelog)
23
+ - [Sinew 3](#sinew-3-may-2021)
24
+ - [Quick Example](#quick-example)
25
+ - [How it Works](#how-it-works)
26
+ - [DSL Reference](#dsl-reference)
27
+ - [Hints](#hints)
28
+ - [Limitations](#limitations)
29
+ - [Changelog](#changelog)
30
+ - [License](#license)
30
31
 
31
- ## Sinew 2 (May 2018)
32
+ ## Sinew 3 (May 2021)
32
33
 
33
- I am pleased to announce the release of Sinew 2.0, a complete rewrite of Sinew for the modern era. Enhancements include:
34
-
35
- * Remove dependencies on active_support, curl and tidy. We use HTTParty now.
36
- * Much easier to customize requests in `.sinew` files. For example, setting User-Agent or Bearer tokens.
37
- * More operations like `post_json` or the generic `http`. These methods are thing wrappers around HTTParty.
38
- * New end-of-run report.
39
- * Tests, rubocop, vscode settings, travis, etc.
34
+ I am pleased to announce the release of Sinew 3.0. Sinew has been streamlined and updated to use the [Faraday](https://lostisland.github.io/faraday/) HTTP client with the [httpdisk](https://github.com/gurgeous/httpdisk/) middleware for caching.
40
35
 
41
36
  **Breaking change**
42
37
 
43
- Sinew uses a new format for cached responses. Old Sinew 1 cache directories must be removed before running Sinew again. Sinew 2 might choke on Sinew 1 cache directores when reading `head/`. This is not tested or supported.
38
+ Sinew 3 uses a new format for cached responses. Old Sinew 2 cache directories should be removed before running Sinew again.
44
39
 
45
40
  ## Quick Example
46
41
 
47
- Here's an example for collecting the links from httpbin.org:
42
+ Here's an example for collecting the links from httpbingo.org:
48
43
 
49
44
  ```ruby
50
45
  # get the url
51
- get "http://httpbin.org"
46
+ get "http://httpbingo.org"
52
47
 
53
48
  # use nokogiri to collect links
54
49
  noko.css("ul li a").each do |a|
@@ -113,9 +108,9 @@ Sinew creates a CSV file with the same name as the recipe, and `csv_emit(hash)`
113
108
 
114
109
  #### Caching
115
110
 
116
- Requests are made using HTTParty, and all responses are cached on disk in `~/.sinew`. Error responses are cached as well. Each URL will be hit exactly once, and requests are rate limited to one per second. Sinew tries to be polite.
111
+ Sinew uses [httpdisk](https://github.com/gurgeous/httpdisk/) to aggressively cache all HTTP responses to disk in `~/.sinew`. Error responses are cached as well. Each URL will be hit exactly once, and requests are rate limited to one per second. Sinew tries to be polite.
117
112
 
118
- The files in `~/.sinew` have nice names and are designed to be human readable. This helps when writing recipes. Sinew never deletes files from the cache - that's up to you!
113
+ Sinew never deletes files from the cache - that's up to you!
119
114
 
120
115
  Because all requests are cached, you can run Sinew repeatedly with confidence. Run it over and over again while you build up your recipe.
121
116
 
@@ -123,65 +118,89 @@ Because all requests are cached, you can run Sinew repeatedly with confidence. R
123
118
 
124
119
  #### Making requests
125
120
 
126
- * `get(url, query = {})` - fetch a url with HTTP GET. URL parameters can be added using `query.
127
- * `post(url, form = {})` - fetch a url with HTTP POST, using `form` as the POST body.
128
- * `post_json(url, json = {})` - fetch a url with HTTP POST, using `json` as the POST body.
129
- * `http(method, url, options = {})` - use this for more complex requests
121
+ - `get(url, query = {})` - fetch a url with HTTP GET. URL parameters can be added using `query`.
122
+ - `post(url, form = {})` - fetch a url with HTTP POST, using `form` as the URL encoded POST body.
123
+ - `post_json(url, json = {})` - fetch a url with HTTP POST, using `json` as the POST body.
124
+ - `http(method, url, options = {})` - use this for more complex requests
130
125
 
131
126
  #### Parsing the response
132
127
 
133
- * `raw` - the raw response from the last request
134
- * `html` - like `raw`, but with a handful of HTML-specific whitespace cleanups
135
- * `noko` - a [Nokogiri](http://nokogiri.org) document built from the tidied HTML
136
- * `json` - parse the response as JSON, with symbolized keys
137
- * `url` - the url of the last request. If the request goes through a redirect, `url` will reflect the final url.
138
- * `uri` - the URI of the last request. This is useful for resolving relative URLs.
128
+ These variables are set after each HTTP request.
129
+
130
+ - `raw` - the raw response from the last request
131
+ - `html` - like `raw`, but with a handful of HTML-specific whitespace cleanups
132
+ - `noko` - parse the response as HTML and return a [Nokogiri](http://nokogiri.org) document
133
+ - `xml` - parse the response as XML and return a [Nokogiri](http://nokogiri.org) document
134
+ - `json` - parse the response as JSON, with symbolized keys
135
+ - `url` - the url of the last request. If the request goes through a redirect, `url` will reflect the final url.
136
+ - `uri` - the URI of the last request. This is useful for resolving relative URLs.
139
137
 
140
138
  #### Writing CSV
141
139
 
142
- * `csv_header(keys)` - specify the columns for CSV output. If you don't call this, Sinew will use the keys from the first call to `csv_emit`.
143
- * `csv_emit(hash)` - append a row to the CSV file
140
+ - `csv_header(keys)` - specify the columns for CSV output. If you don't call this, Sinew will use the keys from the first call to `csv_emit`.
141
+ - `csv_emit(hash)` - append a row to the CSV file
144
142
 
145
143
  ## Hints
146
144
 
147
145
  Writing Sinew recipes is fun and easy. The builtin caching means you can iterate quickly, since you won't have to re-fetch the data. Here are some hints for writing idiomatic recipes:
148
146
 
149
- * Sinew doesn't (yet) check robots.txt - please check it manually.
150
- * Prefer Nokogiri over regular expressions wherever possible. Learn [CSS selectors](http://www.w3schools.com/cssref/css_selectors.asp).
151
- * In Chrome, `$` in the console is your friend.
152
- * Fallback to regular expressions if you're desperate. Depending on the site, use either `raw` or `html`. `html` is probably your best bet. `raw` is good for crawling Javascript, but it's fragile if the site changes.
153
- * Learn to love `String#[regexp]`, which is an obscure operator but incredibly handy for Sinew.
154
- * Laziness is useful. Keep your CSS selectors and regular expressions simple, so maybe they'll work again the next time you need to crawl a site.
155
- * Don't be afraid to mix CSS selectors, regular expressions, and Ruby:
147
+ - Sinew doesn't (yet) check robots.txt - please check it manually.
148
+ - Prefer Nokogiri over regular expressions wherever possible. Learn [CSS selectors](http://www.w3schools.com/cssref/css_selectors.asp).
149
+ - In Chrome, `$` in the console is your friend.
150
+ - Fall back to regular expressions if you're desperate. Depending on the site, use either `raw` or `html`. `html` is probably your best bet. `raw` is good for crawling JavaScript, but it's fragile if the site changes.
151
+ - Learn to love `String#[regexp]`, which is an obscure operator but incredibly handy for Sinew.
152
+ - Laziness is useful. Keep your CSS selectors and regular expressions simple, so maybe they'll work again the next time you need to crawl a site.
153
+ - Don't be afraid to mix CSS selectors, regular expressions, and Ruby:
156
154
 
157
155
  ```ruby
158
156
  noko.css("table")[4].css("td").select { |i| i[:width].to_i > 80 }.map(&:text)
159
157
  ```
160
158
 
161
- * Debug your recipes using plain old `puts`, or better yet use `ap` from [awesome_print](https://github.com/michaeldv/awesome_print).
162
- * Run `sinew -v` to get a report on every `csv_emit`. Very handy.
163
- * Add the CSV files to your git repo. That way you can version them and get diffs!
159
+ - Debug your recipes using plain old `puts`, or better yet use `ap` from [amazing_print](https://github.com/amazing-print/amazing_print).
160
+ - Run `sinew -v` to get a report on every `csv_emit`. Very handy.
161
+ - Add the CSV files to your git repo. That way you can version them and get diffs!
164
162
 
165
163
  ## Limitations
166
164
 
167
- * Caching is based on URL, so use caution with cookies and other forms of authentication
168
- * Almost no support for international (non-english) characters
165
+ - Caching is based on URL, so use caution with cookies and other forms of authentication
166
+ - Almost no support for international (non-English) characters
169
167
 
170
168
  ## Changelog
171
169
 
172
- #### 2.0.0 (May 2018)
170
+ #### 3.0.0 (May 2021)
171
+
172
+ - Major rewrite of network and caching layer. See above.
173
+ - Use the Faraday HTTP client with the httpdisk middleware for caching.
174
+ - Supports multiple proxies (`--proxy host1,host2,...`)
175
+
176
+ #### 2.0.4 (May 2018)
173
177
 
174
- * Complete rewrite. See above.
178
+ - Handle and cache more errors (too many redirects, connection failures, etc.)
179
+ - Support for adding uri.scheme in generate_cache_key
180
+ - Added status `code`, a peer to `uri`, `raw`, etc.
175
181
 
176
- #### 1.0.3
182
+ #### 2.0.3 (May 2018)
183
+
184
+ - &amp; now normalizes to & (not and)
185
+
186
+ #### 2.0.2 (May 2018)
187
+
188
+ - Support for `--limit`, `--proxy` and the `xml` variable
189
+ - Dedup - warn and ignore if row[:url] has already been emitted
190
+ - Auto gunzip if contents are compressed
191
+
192
+ #### 2.0.1 (May 2018)
193
+
194
+ - Support for legacy cached `head` files from Sinew 1
195
+
196
+ #### 2.0.0 (May 2018)
177
197
 
178
- * Friendlier message if curl or tidy are missing.
198
+ - Complete rewrite. See above.
179
199
 
180
- #### 1.0.2
200
+ #### 1.0.3 (June 2012)
181
201
 
182
- * Remove entity options from tidy, which didn't work on MacOS (thanks Rex!)
202
+ ...
183
203
 
184
- #### 1.0.1
204
+ ## License
185
205
 
186
- * Trying to run on 1.8 produces a fatal error. Onward!
187
- * Added first batch of unit tests
206
+ This gem is [licensed under the MIT License](LICENSE).
data/Rakefile CHANGED
@@ -1,38 +1,53 @@
1
- require 'bundler'
2
1
  require 'bundler/setup'
3
2
 
4
- require 'rake'
5
3
  require 'rake/testtask'
6
4
  require 'sinew/version'
7
5
 
6
+ # load the spec, we use it below
7
+ spec = Gem::Specification.load('sinew.gemspec')
8
+
8
9
  #
9
- # gem
10
+ # testing
11
+ # don't forget about TESTOPTS="--verbose" rake
12
+ # also: rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
10
13
  #
11
14
 
12
- task gem: :build
13
- task :build do
14
- system 'gem build --quiet sinew.gemspec'
15
+ # test (default)
16
+ task default: :test
17
+
18
+ Rake::TestTask.new do
19
+ _1.libs << 'test'
20
+ _1.warning = false # sterile has a few issues here
15
21
  end
16
22
 
17
- task install: :build do
18
- system "sudo gem install --quiet sinew-#{Sinew::VERSION}.gem"
23
+ # Watch rb files, run tests whenever something changes
24
+ task :watch do
25
+ # https://superuser.com/a/665208 / https://unix.stackexchange.com/a/42288
26
+ system("while true; do find . -name '*.rb' | entr -c -d rake; test $? -gt 128 && break; done")
19
27
  end
20
28
 
21
- task release: :build do
22
- system "git tag -a #{Sinew::VERSION} -m 'Tagging #{Sinew::VERSION}'"
23
- system 'git push --tags'
24
- system "gem push sinew-#{Sinew::VERSION}.gem"
29
+ #
30
+ # rubocop
31
+ #
32
+
33
+ task :rubocop do
34
+ system('bundle exec rubocop -A .', exception: true)
25
35
  end
26
36
 
27
37
  #
28
- # minitest
38
+ # gem
29
39
  #
30
40
 
31
- Rake::TestTask.new(:test) do |t|
32
- t.warning = false
41
+ task :build do
42
+ system 'gem build --quiet sinew.gemspec', exception: true
33
43
  end
34
44
 
35
- task default: :test
45
+ task install: :build do
46
+ system "gem install --quiet sinew-#{spec.version}.gem", exception: true
47
+ end
36
48
 
37
- # to test:
38
- # block ; rake install && rm -rf ~/.sinew/www.amazon.com && /usr/local/bin/sinew sample.sinew
49
+ task release: %i[rubocop test build] do
50
+ system "git tag -a #{spec.version} -m 'Tagging #{spec.version}'", exception: true
51
+ system 'git push --tags', exception: true
52
+ system "gem push sinew-#{spec.version}.gem", exception: true
53
+ end
data/bin/sinew CHANGED
@@ -11,11 +11,15 @@ require 'slop'
11
11
 
12
12
  options = Slop.parse do |o|
13
13
  o.banner = 'Usage: sinew [options] <gub.sinew>'
14
- o.bool '-v', '--verbose', 'dump every row'
15
- o.bool '--version', 'show version'
14
+ o.bool '-v', '--verbose', 'dump emitted rows while running'
16
15
  o.bool '-q', '--quiet', 'suppress some output'
17
- o.string '--cache', 'Set the cache directory (defaults to ~/.sinew)', default: "#{ENV['HOME']}/.sinew"
18
- o.on '--help' do
16
+ o.integer '-l', '--limit', 'quit after emitting this many rows'
17
+ o.string '-c', '--cache', 'set custom cache directory', default: "#{ENV['HOME']}/.sinew"
18
+ o.bool '--force', "don't read anything from cache (but still write)"
19
+ o.bool '--force-errors', "don't read errors from cache (but still write)"
20
+ o.string '--proxy', 'use host[:port] as HTTP proxy'
21
+ o.bool '--version', 'show version and exit'
22
+ o.on('--help', 'show this help') do
19
23
  puts o
20
24
  exit
21
25
  end
data/lib/sinew.rb CHANGED
@@ -1,4 +1,3 @@
1
- require_relative 'sinew/cache'
2
1
  require_relative 'sinew/core_ext'
3
2
  require_relative 'sinew/dsl'
4
3
  require_relative 'sinew/main'
@@ -0,0 +1,52 @@
1
+ require 'faraday'
2
+ require 'faraday-encoding'
3
+ require 'faraday/logging/formatter'
4
+ require 'httpdisk'
5
+ require 'sinew/connection/log_formatter'
6
+ require 'sinew/connection/rate_limit'
7
+
8
+ module Sinew
9
+ module Connection
10
+ def self.create(options:, runtime_options:)
11
+ connection_options = {}
12
+ connection_options[:ssl] = { verify: false } if runtime_options.insecure
13
+
14
+ Faraday.new(nil, connection_options) do
15
+ _1.use RateLimit, rate_limit: runtime_options.rate_limit
16
+
17
+ # auto-encode form bodies
18
+ _1.request :url_encoded
19
+
20
+ # Before httpdisk so each redirect segment is cached
21
+ # Keep track of redirect status for logger
22
+ _1.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }
23
+
24
+ # set Ruby string encoding based on Content-Type (should be above httpdisk)
25
+ _1.response :encoding
26
+
27
+ # disk caching
28
+ httpdisk_options = {
29
+ dir: options[:cache],
30
+ force: options[:force],
31
+ force_errors: options[:force_errors],
32
+ }.merge(runtime_options.httpdisk_options)
33
+
34
+ _1.use :httpdisk, httpdisk_options
35
+
36
+ # After httpdisk so that only non-cached requests are logged.
37
+ # Before retry so that we don't log each retry attempt.
38
+ _1.response :logger, nil, formatter: LogFormatter if !options[:quiet]
39
+
40
+ # After httpdisk so transient failures are not cached
41
+ retry_options = {
42
+ interval: runtime_options.rate_limit,
43
+ max: runtime_options.retries,
44
+ methods: %w[delete get head options patch post put trace],
45
+ retry_statuses: (500..600).to_a,
46
+ retry_if: ->(_env, _err) { true },
47
+ }
48
+ _1.request :retry, retry_options
49
+ end
50
+ end
51
+ end
52
+ end