sinew 2.0.3 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/test.yml +26 -0
  3. data/.gitignore +3 -5
  4. data/.rubocop.yml +31 -46
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +124 -0
  7. data/README.md +146 -81
  8. data/Rakefile +36 -20
  9. data/bin/sinew +13 -39
  10. data/lib/sinew.rb +23 -10
  11. data/lib/sinew/args.rb +53 -0
  12. data/lib/sinew/base.rb +251 -0
  13. data/lib/sinew/csv.rb +89 -0
  14. data/lib/sinew/main.rb +45 -98
  15. data/lib/sinew/middleware/log_formatter.rb +23 -0
  16. data/lib/sinew/nokogiri_ext.rb +12 -21
  17. data/lib/sinew/response.rb +39 -99
  18. data/lib/sinew/version.rb +1 -1
  19. data/sample.rb +13 -0
  20. data/sample.sinew +4 -4
  21. data/sinew.gemspec +26 -25
  22. metadata +46 -108
  23. data/.travis.yml +0 -4
  24. data/.vscode/extensions.json +0 -3
  25. data/.vscode/settings.json +0 -15
  26. data/lib/sinew/cache.rb +0 -79
  27. data/lib/sinew/core_ext.rb +0 -59
  28. data/lib/sinew/dsl.rb +0 -114
  29. data/lib/sinew/output.rb +0 -149
  30. data/lib/sinew/request.rb +0 -151
  31. data/lib/sinew/runtime_options.rb +0 -28
  32. data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
  33. data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
  34. data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
  35. data/test/legacy/eu.httpbin.org/status,500 +0 -1
  36. data/test/legacy/legacy.sinew +0 -2
  37. data/test/recipes/array_header.sinew +0 -6
  38. data/test/recipes/basic.sinew +0 -8
  39. data/test/recipes/dups.sinew +0 -7
  40. data/test/recipes/implicit_header.sinew +0 -5
  41. data/test/recipes/limit.sinew +0 -11
  42. data/test/recipes/noko.sinew +0 -9
  43. data/test/recipes/uri.sinew +0 -11
  44. data/test/recipes/xml.sinew +0 -8
  45. data/test/test.html +0 -45
  46. data/test/test_cache.rb +0 -69
  47. data/test/test_helper.rb +0 -123
  48. data/test/test_legacy.rb +0 -23
  49. data/test/test_main.rb +0 -34
  50. data/test/test_nokogiri_ext.rb +0 -18
  51. data/test/test_output.rb +0 -56
  52. data/test/test_recipes.rb +0 -60
  53. data/test/test_requests.rb +0 -135
  54. data/test/test_utf8.rb +0 -39
data/lib/sinew/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = '2.0.3'.freeze
3
+ VERSION = '4.0.0'.freeze
4
4
  end
data/sample.rb ADDED
@@ -0,0 +1,13 @@
1
+ require_relative 'lib/sinew'
2
+
3
+ sinew = Sinew.new(output: 'sample.csv', verbose: true)
4
+
5
+ response = sinew.get 'http://httpbingo.org'
6
+ response.noko.css('ul li a').each do |a|
7
+ row = {}
8
+ row[:url] = a[:href]
9
+ row[:title] = a.text
10
+ sinew.csv_emit(row)
11
+ end
12
+
13
+ sinew.get 'http://httpbingo.org/redirect/2'
data/sample.sinew CHANGED
@@ -1,9 +1,9 @@
1
- get 'http://httpbin.org'
2
- noko.css('ul li a').each do |a|
1
+ response = sinew.get 'http://httpbingo.org'
2
+ response.noko.css('ul li a').each do |a|
3
3
  row = {}
4
4
  row[:url] = a[:href]
5
5
  row[:title] = a.text
6
- csv_emit(row)
6
+ sinew.csv_emit(row)
7
7
  end
8
8
 
9
- get 'http://httpbin.org/redirect/2'
9
+ sinew.get 'http://httpbingo.org/redirect/2'
data/sinew.gemspec CHANGED
@@ -3,32 +3,33 @@ $LOAD_PATH.unshift("#{__dir__}/lib")
3
3
  require 'sinew/version'
4
4
 
5
5
  Gem::Specification.new do |s|
6
- s.name = 'sinew'
7
- s.version = Sinew::VERSION
8
- s.platform = Gem::Platform::RUBY
9
- s.license = 'MIT'
10
- s.authors = [ 'Adam Doppelt' ]
11
- s.email = [ 'amd@gurge.com' ]
12
- s.homepage = 'http://github.com/gurgeous/sinew'
13
- s.summary = 'Sinew - structured web crawling using recipes.'
14
- s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
15
- s.required_ruby_version = '~> 2.3'
6
+ s.name = 'sinew'
7
+ s.version = Sinew::VERSION
8
+ s.authors = ['Adam Doppelt', 'Nathan Kriege']
9
+ s.email = ['amd@gurge.com']
16
10
 
17
- s.rubyforge_project = 'sinew'
11
+ s.summary = 'Sinew - structured web crawling using recipes.'
12
+ s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
13
+ s.homepage = 'http://github.com/gurgeous/sinew'
14
+ s.license = 'MIT'
15
+ s.required_ruby_version = '>= 2.7'
18
16
 
19
- s.add_runtime_dependency 'awesome_print', '~> 1.8'
20
- s.add_runtime_dependency 'htmlentities', '~> 4.3'
21
- s.add_runtime_dependency 'httparty', '~> 0.16'
22
- s.add_runtime_dependency 'nokogiri', '~> 1.8'
23
- s.add_runtime_dependency 'scripto', '~> 0'
24
- s.add_runtime_dependency 'slop', '~> 4.6'
25
- s.add_runtime_dependency 'stringex', '~> 2.8'
26
- s.add_development_dependency 'minitest', '~> 5.11'
27
- s.add_development_dependency 'rake', '~> 12.3'
28
- s.add_development_dependency 'webmock', '~> 3.4'
17
+ # what's in the gem?
18
+ s.files = Dir.chdir(File.expand_path(__dir__)) do
19
+ `git ls-files -z`.split("\x0").reject { _1.match(%r{^test/}) }
20
+ end
21
+ s.bindir = 'bin'
22
+ s.executables = s.files.grep(%r{^#{s.bindir}/}) { File.basename(_1) }
23
+ s.require_paths = ['lib']
29
24
 
30
- s.files = `git ls-files`.split("\n")
31
- s.test_files = `git ls-files -- test/*`.split("\n")
32
- s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
33
- s.require_paths = [ 'lib' ]
25
+ # gem dependencies
26
+ s.add_dependency 'amazing_print', '~> 1.3'
27
+ s.add_dependency 'faraday', '~> 1.4'
28
+ s.add_dependency 'faraday-encoding', '~> 0'
29
+ s.add_dependency 'faraday-rate_limiter', '~> 0.0'
30
+ s.add_dependency 'hashie', '~> 4.1'
31
+ s.add_dependency 'httpdisk', '~> 0.5'
32
+ s.add_dependency 'nokogiri', '~> 1.11'
33
+ s.add_dependency 'slop', '~> 4.8'
34
+ s.add_dependency 'sterile', '~> 1.0'
34
35
  end
metadata CHANGED
@@ -1,155 +1,142 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sinew
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.3
4
+ version: 4.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Doppelt
8
- autorequire:
8
+ - Nathan Kriege
9
+ autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2018-05-15 00:00:00.000000000 Z
12
+ date: 2021-07-09 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
- name: awesome_print
15
+ name: amazing_print
15
16
  requirement: !ruby/object:Gem::Requirement
16
17
  requirements:
17
18
  - - "~>"
18
19
  - !ruby/object:Gem::Version
19
- version: '1.8'
20
+ version: '1.3'
20
21
  type: :runtime
21
22
  prerelease: false
22
23
  version_requirements: !ruby/object:Gem::Requirement
23
24
  requirements:
24
25
  - - "~>"
25
26
  - !ruby/object:Gem::Version
26
- version: '1.8'
27
+ version: '1.3'
27
28
  - !ruby/object:Gem::Dependency
28
- name: htmlentities
29
+ name: faraday
29
30
  requirement: !ruby/object:Gem::Requirement
30
31
  requirements:
31
32
  - - "~>"
32
33
  - !ruby/object:Gem::Version
33
- version: '4.3'
34
+ version: '1.4'
34
35
  type: :runtime
35
36
  prerelease: false
36
37
  version_requirements: !ruby/object:Gem::Requirement
37
38
  requirements:
38
39
  - - "~>"
39
40
  - !ruby/object:Gem::Version
40
- version: '4.3'
41
+ version: '1.4'
41
42
  - !ruby/object:Gem::Dependency
42
- name: httparty
43
+ name: faraday-encoding
43
44
  requirement: !ruby/object:Gem::Requirement
44
45
  requirements:
45
46
  - - "~>"
46
47
  - !ruby/object:Gem::Version
47
- version: '0.16'
48
+ version: '0'
48
49
  type: :runtime
49
50
  prerelease: false
50
51
  version_requirements: !ruby/object:Gem::Requirement
51
52
  requirements:
52
53
  - - "~>"
53
54
  - !ruby/object:Gem::Version
54
- version: '0.16'
55
+ version: '0'
55
56
  - !ruby/object:Gem::Dependency
56
- name: nokogiri
57
+ name: faraday-rate_limiter
57
58
  requirement: !ruby/object:Gem::Requirement
58
59
  requirements:
59
60
  - - "~>"
60
61
  - !ruby/object:Gem::Version
61
- version: '1.8'
62
+ version: '0.0'
62
63
  type: :runtime
63
64
  prerelease: false
64
65
  version_requirements: !ruby/object:Gem::Requirement
65
66
  requirements:
66
67
  - - "~>"
67
68
  - !ruby/object:Gem::Version
68
- version: '1.8'
69
+ version: '0.0'
69
70
  - !ruby/object:Gem::Dependency
70
- name: scripto
71
+ name: hashie
71
72
  requirement: !ruby/object:Gem::Requirement
72
73
  requirements:
73
74
  - - "~>"
74
75
  - !ruby/object:Gem::Version
75
- version: '0'
76
+ version: '4.1'
76
77
  type: :runtime
77
78
  prerelease: false
78
79
  version_requirements: !ruby/object:Gem::Requirement
79
80
  requirements:
80
81
  - - "~>"
81
82
  - !ruby/object:Gem::Version
82
- version: '0'
83
+ version: '4.1'
83
84
  - !ruby/object:Gem::Dependency
84
- name: slop
85
+ name: httpdisk
85
86
  requirement: !ruby/object:Gem::Requirement
86
87
  requirements:
87
88
  - - "~>"
88
89
  - !ruby/object:Gem::Version
89
- version: '4.6'
90
+ version: '0.5'
90
91
  type: :runtime
91
92
  prerelease: false
92
93
  version_requirements: !ruby/object:Gem::Requirement
93
94
  requirements:
94
95
  - - "~>"
95
96
  - !ruby/object:Gem::Version
96
- version: '4.6'
97
+ version: '0.5'
97
98
  - !ruby/object:Gem::Dependency
98
- name: stringex
99
+ name: nokogiri
99
100
  requirement: !ruby/object:Gem::Requirement
100
101
  requirements:
101
102
  - - "~>"
102
103
  - !ruby/object:Gem::Version
103
- version: '2.8'
104
+ version: '1.11'
104
105
  type: :runtime
105
106
  prerelease: false
106
107
  version_requirements: !ruby/object:Gem::Requirement
107
108
  requirements:
108
109
  - - "~>"
109
110
  - !ruby/object:Gem::Version
110
- version: '2.8'
111
+ version: '1.11'
111
112
  - !ruby/object:Gem::Dependency
112
- name: minitest
113
- requirement: !ruby/object:Gem::Requirement
114
- requirements:
115
- - - "~>"
116
- - !ruby/object:Gem::Version
117
- version: '5.11'
118
- type: :development
119
- prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- requirements:
122
- - - "~>"
123
- - !ruby/object:Gem::Version
124
- version: '5.11'
125
- - !ruby/object:Gem::Dependency
126
- name: rake
113
+ name: slop
127
114
  requirement: !ruby/object:Gem::Requirement
128
115
  requirements:
129
116
  - - "~>"
130
117
  - !ruby/object:Gem::Version
131
- version: '12.3'
132
- type: :development
118
+ version: '4.8'
119
+ type: :runtime
133
120
  prerelease: false
134
121
  version_requirements: !ruby/object:Gem::Requirement
135
122
  requirements:
136
123
  - - "~>"
137
124
  - !ruby/object:Gem::Version
138
- version: '12.3'
125
+ version: '4.8'
139
126
  - !ruby/object:Gem::Dependency
140
- name: webmock
127
+ name: sterile
141
128
  requirement: !ruby/object:Gem::Requirement
142
129
  requirements:
143
130
  - - "~>"
144
131
  - !ruby/object:Gem::Version
145
- version: '3.4'
146
- type: :development
132
+ version: '1.0'
133
+ type: :runtime
147
134
  prerelease: false
148
135
  version_requirements: !ruby/object:Gem::Requirement
149
136
  requirements:
150
137
  - - "~>"
151
138
  - !ruby/object:Gem::Version
152
- version: '3.4'
139
+ version: '1.0'
153
140
  description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
154
141
  email:
155
142
  - amd@gurge.com
@@ -158,97 +145,48 @@ executables:
158
145
  extensions: []
159
146
  extra_rdoc_files: []
160
147
  files:
148
+ - ".github/workflows/test.yml"
161
149
  - ".gitignore"
162
150
  - ".rubocop.yml"
163
- - ".travis.yml"
164
- - ".vscode/extensions.json"
165
- - ".vscode/settings.json"
166
151
  - Gemfile
152
+ - Gemfile.lock
167
153
  - LICENSE
168
154
  - README.md
169
155
  - Rakefile
170
156
  - bin/sinew
171
157
  - lib/sinew.rb
172
- - lib/sinew/cache.rb
173
- - lib/sinew/core_ext.rb
174
- - lib/sinew/dsl.rb
158
+ - lib/sinew/args.rb
159
+ - lib/sinew/base.rb
160
+ - lib/sinew/csv.rb
175
161
  - lib/sinew/main.rb
162
+ - lib/sinew/middleware/log_formatter.rb
176
163
  - lib/sinew/nokogiri_ext.rb
177
- - lib/sinew/output.rb
178
- - lib/sinew/request.rb
179
164
  - lib/sinew/response.rb
180
- - lib/sinew/runtime_options.rb
181
165
  - lib/sinew/version.rb
166
+ - sample.rb
182
167
  - sample.sinew
183
168
  - sinew.gemspec
184
- - test/legacy/eu.httpbin.org/head/redirect,3
185
- - test/legacy/eu.httpbin.org/head/status,500
186
- - test/legacy/eu.httpbin.org/redirect,3
187
- - test/legacy/eu.httpbin.org/status,500
188
- - test/legacy/legacy.sinew
189
- - test/recipes/array_header.sinew
190
- - test/recipes/basic.sinew
191
- - test/recipes/dups.sinew
192
- - test/recipes/implicit_header.sinew
193
- - test/recipes/limit.sinew
194
- - test/recipes/noko.sinew
195
- - test/recipes/uri.sinew
196
- - test/recipes/xml.sinew
197
- - test/test.html
198
- - test/test_cache.rb
199
- - test/test_helper.rb
200
- - test/test_legacy.rb
201
- - test/test_main.rb
202
- - test/test_nokogiri_ext.rb
203
- - test/test_output.rb
204
- - test/test_recipes.rb
205
- - test/test_requests.rb
206
- - test/test_utf8.rb
207
169
  homepage: http://github.com/gurgeous/sinew
208
170
  licenses:
209
171
  - MIT
210
172
  metadata: {}
211
- post_install_message:
173
+ post_install_message:
212
174
  rdoc_options: []
213
175
  require_paths:
214
176
  - lib
215
177
  required_ruby_version: !ruby/object:Gem::Requirement
216
178
  requirements:
217
- - - "~>"
179
+ - - ">="
218
180
  - !ruby/object:Gem::Version
219
- version: '2.3'
181
+ version: '2.7'
220
182
  required_rubygems_version: !ruby/object:Gem::Requirement
221
183
  requirements:
222
184
  - - ">="
223
185
  - !ruby/object:Gem::Version
224
186
  version: '0'
225
187
  requirements: []
226
- rubyforge_project: sinew
227
- rubygems_version: 2.7.6
228
- signing_key:
188
+ rubygems_version: 3.1.4
189
+ signing_key:
229
190
  specification_version: 4
230
191
  summary: Sinew - structured web crawling using recipes.
231
- test_files:
232
- - test/legacy/eu.httpbin.org/head/redirect,3
233
- - test/legacy/eu.httpbin.org/head/status,500
234
- - test/legacy/eu.httpbin.org/redirect,3
235
- - test/legacy/eu.httpbin.org/status,500
236
- - test/legacy/legacy.sinew
237
- - test/recipes/array_header.sinew
238
- - test/recipes/basic.sinew
239
- - test/recipes/dups.sinew
240
- - test/recipes/implicit_header.sinew
241
- - test/recipes/limit.sinew
242
- - test/recipes/noko.sinew
243
- - test/recipes/uri.sinew
244
- - test/recipes/xml.sinew
245
- - test/test.html
246
- - test/test_cache.rb
247
- - test/test_helper.rb
248
- - test/test_legacy.rb
249
- - test/test_main.rb
250
- - test/test_nokogiri_ext.rb
251
- - test/test_output.rb
252
- - test/test_recipes.rb
253
- - test/test_requests.rb
254
- - test/test_utf8.rb
192
+ test_files: []
data/.travis.yml DELETED
@@ -1,4 +0,0 @@
1
- language: ruby
2
- rvm:
3
- - 2.3.7
4
- - 2.5.1
data/.vscode/extensions.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "recommendations": ["rebornix.Ruby"]
3
- }
data/.vscode/settings.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "editor.formatOnSave": true,
3
- "editor.formatOnSaveTimeout": 1500,
4
- "editor.tabSize": 2,
5
- "editor.wordSeparators": "`~#$%^&*()-=+[{]}\\|;:'\",.<>/",
6
- "files.associations": {
7
- "*.sinew": "ruby"
8
- },
9
- "files.insertFinalNewline": true,
10
- "files.trimTrailingWhitespace": true,
11
- "ruby.format": "rubocop",
12
- "ruby.lint": {
13
- "rubocop": true
14
- }
15
- }
data/lib/sinew/cache.rb DELETED
@@ -1,79 +0,0 @@
1
- require 'fileutils'
2
- require 'tempfile'
3
-
4
- #
5
- # This class handles the caching of http responses on disk.
6
- #
7
-
8
- module Sinew
9
- class Cache
10
- attr_reader :sinew
11
-
12
- def initialize(sinew)
13
- @sinew = sinew
14
- end
15
-
16
- def get(request)
17
- body = read_if_exist(body_path(request))
18
- return nil if !body
19
-
20
- head = read_if_exist(head_path(request))
21
- Response.from_cache(request, body, head)
22
- end
23
-
24
- def set(response)
25
- body_path = body_path(response.request)
26
- head_path = head_path(response.request)
27
-
28
- FileUtils.mkdir_p(File.dirname(body_path))
29
- FileUtils.mkdir_p(File.dirname(head_path))
30
-
31
- # write body, and head if necessary
32
- atomic_write(body_path, response.body)
33
- if head_necessary?(response)
34
- head = JSON.pretty_generate(response.head_as_json)
35
- atomic_write(head_path, head)
36
- end
37
- end
38
-
39
- def root_dir
40
- sinew.options[:cache]
41
- end
42
- protected :root_dir
43
-
44
- def head_necessary?(response)
45
- response.error? || response.redirected?
46
- end
47
- protected :head_necessary?
48
-
49
- def body_path(request)
50
- "#{root_dir}/#{request.cache_key}"
51
- end
52
- protected :body_path
53
-
54
- def head_path(request)
55
- body_path = body_path(request)
56
- dir, base = File.dirname(body_path), File.basename(body_path)
57
- "#{dir}/head/#{base}"
58
- end
59
- protected :head_path
60
-
61
- def read_if_exist(path)
62
- if File.exist?(path)
63
- IO.read(path, mode: 'r:UTF-8')
64
- end
65
- end
66
- protected :read_if_exist
67
-
68
- def atomic_write(path, data)
69
- tmp = Tempfile.new('sinew', encoding: 'UTF-8')
70
- tmp.write(data)
71
- tmp.close
72
- FileUtils.chmod(0o644, tmp.path)
73
- FileUtils.mv(tmp.path, path)
74
- ensure
75
- FileUtils.rm(tmp.path, force: true)
76
- end
77
- protected :atomic_write
78
- end
79
- end