sinew 2.0.3 → 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/test.yml +26 -0
  3. data/.gitignore +3 -5
  4. data/.rubocop.yml +31 -46
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +124 -0
  7. data/README.md +146 -81
  8. data/Rakefile +36 -20
  9. data/bin/sinew +13 -39
  10. data/lib/sinew.rb +23 -10
  11. data/lib/sinew/args.rb +53 -0
  12. data/lib/sinew/base.rb +251 -0
  13. data/lib/sinew/csv.rb +89 -0
  14. data/lib/sinew/main.rb +45 -98
  15. data/lib/sinew/middleware/log_formatter.rb +23 -0
  16. data/lib/sinew/nokogiri_ext.rb +12 -21
  17. data/lib/sinew/response.rb +39 -99
  18. data/lib/sinew/version.rb +1 -1
  19. data/sample.rb +13 -0
  20. data/sample.sinew +4 -4
  21. data/sinew.gemspec +26 -25
  22. metadata +46 -108
  23. data/.travis.yml +0 -4
  24. data/.vscode/extensions.json +0 -3
  25. data/.vscode/settings.json +0 -15
  26. data/lib/sinew/cache.rb +0 -79
  27. data/lib/sinew/core_ext.rb +0 -59
  28. data/lib/sinew/dsl.rb +0 -114
  29. data/lib/sinew/output.rb +0 -149
  30. data/lib/sinew/request.rb +0 -151
  31. data/lib/sinew/runtime_options.rb +0 -28
  32. data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
  33. data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
  34. data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
  35. data/test/legacy/eu.httpbin.org/status,500 +0 -1
  36. data/test/legacy/legacy.sinew +0 -2
  37. data/test/recipes/array_header.sinew +0 -6
  38. data/test/recipes/basic.sinew +0 -8
  39. data/test/recipes/dups.sinew +0 -7
  40. data/test/recipes/implicit_header.sinew +0 -5
  41. data/test/recipes/limit.sinew +0 -11
  42. data/test/recipes/noko.sinew +0 -9
  43. data/test/recipes/uri.sinew +0 -11
  44. data/test/recipes/xml.sinew +0 -8
  45. data/test/test.html +0 -45
  46. data/test/test_cache.rb +0 -69
  47. data/test/test_helper.rb +0 -123
  48. data/test/test_legacy.rb +0 -23
  49. data/test/test_main.rb +0 -34
  50. data/test/test_nokogiri_ext.rb +0 -18
  51. data/test/test_output.rb +0 -56
  52. data/test/test_recipes.rb +0 -60
  53. data/test/test_requests.rb +0 -135
  54. data/test/test_utf8.rb +0 -39
data/lib/sinew/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = '2.0.3'.freeze
3
+ VERSION = '4.0.0'.freeze
4
4
  end
data/sample.rb ADDED
@@ -0,0 +1,13 @@
1
+ require_relative 'lib/sinew'
2
+
3
+ sinew = Sinew.new(output: 'sample.csv', verbose: true)
4
+
5
+ response = sinew.get 'http://httpbingo.org'
6
+ response.noko.css('ul li a').each do |a|
7
+ row = {}
8
+ row[:url] = a[:href]
9
+ row[:title] = a.text
10
+ sinew.csv_emit(row)
11
+ end
12
+
13
+ sinew.get 'http://httpbingo.org/redirect/2'
data/sample.sinew CHANGED
@@ -1,9 +1,9 @@
1
- get 'http://httpbin.org'
2
- noko.css('ul li a').each do |a|
1
+ response = sinew.get 'http://httpbingo.org'
2
+ response.noko.css('ul li a').each do |a|
3
3
  row = {}
4
4
  row[:url] = a[:href]
5
5
  row[:title] = a.text
6
- csv_emit(row)
6
+ sinew.csv_emit(row)
7
7
  end
8
8
 
9
- get 'http://httpbin.org/redirect/2'
9
+ sinew.get 'http://httpbingo.org/redirect/2'
data/sinew.gemspec CHANGED
@@ -3,32 +3,33 @@ $LOAD_PATH.unshift("#{__dir__}/lib")
3
3
  require 'sinew/version'
4
4
 
5
5
  Gem::Specification.new do |s|
6
- s.name = 'sinew'
7
- s.version = Sinew::VERSION
8
- s.platform = Gem::Platform::RUBY
9
- s.license = 'MIT'
10
- s.authors = [ 'Adam Doppelt' ]
11
- s.email = [ 'amd@gurge.com' ]
12
- s.homepage = 'http://github.com/gurgeous/sinew'
13
- s.summary = 'Sinew - structured web crawling using recipes.'
14
- s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
15
- s.required_ruby_version = '~> 2.3'
6
+ s.name = 'sinew'
7
+ s.version = Sinew::VERSION
8
+ s.authors = ['Adam Doppelt', 'Nathan Kriege']
9
+ s.email = ['amd@gurge.com']
16
10
 
17
- s.rubyforge_project = 'sinew'
11
+ s.summary = 'Sinew - structured web crawling using recipes.'
12
+ s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
13
+ s.homepage = 'http://github.com/gurgeous/sinew'
14
+ s.license = 'MIT'
15
+ s.required_ruby_version = '>= 2.7'
18
16
 
19
- s.add_runtime_dependency 'awesome_print', '~> 1.8'
20
- s.add_runtime_dependency 'htmlentities', '~> 4.3'
21
- s.add_runtime_dependency 'httparty', '~> 0.16'
22
- s.add_runtime_dependency 'nokogiri', '~> 1.8'
23
- s.add_runtime_dependency 'scripto', '~> 0'
24
- s.add_runtime_dependency 'slop', '~> 4.6'
25
- s.add_runtime_dependency 'stringex', '~> 2.8'
26
- s.add_development_dependency 'minitest', '~> 5.11'
27
- s.add_development_dependency 'rake', '~> 12.3'
28
- s.add_development_dependency 'webmock', '~> 3.4'
17
+ # what's in the gem?
18
+ s.files = Dir.chdir(File.expand_path(__dir__)) do
19
+ `git ls-files -z`.split("\x0").reject { _1.match(%r{^test/}) }
20
+ end
21
+ s.bindir = 'bin'
22
+ s.executables = s.files.grep(%r{^#{s.bindir}/}) { File.basename(_1) }
23
+ s.require_paths = ['lib']
29
24
 
30
- s.files = `git ls-files`.split("\n")
31
- s.test_files = `git ls-files -- test/*`.split("\n")
32
- s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
33
- s.require_paths = [ 'lib' ]
25
+ # gem dependencies
26
+ s.add_dependency 'amazing_print', '~> 1.3'
27
+ s.add_dependency 'faraday', '~> 1.4'
28
+ s.add_dependency 'faraday-encoding', '~> 0'
29
+ s.add_dependency 'faraday-rate_limiter', '~> 0.0'
30
+ s.add_dependency 'hashie', '~> 4.1'
31
+ s.add_dependency 'httpdisk', '~> 0.5'
32
+ s.add_dependency 'nokogiri', '~> 1.11'
33
+ s.add_dependency 'slop', '~> 4.8'
34
+ s.add_dependency 'sterile', '~> 1.0'
34
35
  end
metadata CHANGED
@@ -1,155 +1,142 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sinew
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.3
4
+ version: 4.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Doppelt
8
- autorequire:
8
+ - Nathan Kriege
9
+ autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2018-05-15 00:00:00.000000000 Z
12
+ date: 2021-07-09 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
- name: awesome_print
15
+ name: amazing_print
15
16
  requirement: !ruby/object:Gem::Requirement
16
17
  requirements:
17
18
  - - "~>"
18
19
  - !ruby/object:Gem::Version
19
- version: '1.8'
20
+ version: '1.3'
20
21
  type: :runtime
21
22
  prerelease: false
22
23
  version_requirements: !ruby/object:Gem::Requirement
23
24
  requirements:
24
25
  - - "~>"
25
26
  - !ruby/object:Gem::Version
26
- version: '1.8'
27
+ version: '1.3'
27
28
  - !ruby/object:Gem::Dependency
28
- name: htmlentities
29
+ name: faraday
29
30
  requirement: !ruby/object:Gem::Requirement
30
31
  requirements:
31
32
  - - "~>"
32
33
  - !ruby/object:Gem::Version
33
- version: '4.3'
34
+ version: '1.4'
34
35
  type: :runtime
35
36
  prerelease: false
36
37
  version_requirements: !ruby/object:Gem::Requirement
37
38
  requirements:
38
39
  - - "~>"
39
40
  - !ruby/object:Gem::Version
40
- version: '4.3'
41
+ version: '1.4'
41
42
  - !ruby/object:Gem::Dependency
42
- name: httparty
43
+ name: faraday-encoding
43
44
  requirement: !ruby/object:Gem::Requirement
44
45
  requirements:
45
46
  - - "~>"
46
47
  - !ruby/object:Gem::Version
47
- version: '0.16'
48
+ version: '0'
48
49
  type: :runtime
49
50
  prerelease: false
50
51
  version_requirements: !ruby/object:Gem::Requirement
51
52
  requirements:
52
53
  - - "~>"
53
54
  - !ruby/object:Gem::Version
54
- version: '0.16'
55
+ version: '0'
55
56
  - !ruby/object:Gem::Dependency
56
- name: nokogiri
57
+ name: faraday-rate_limiter
57
58
  requirement: !ruby/object:Gem::Requirement
58
59
  requirements:
59
60
  - - "~>"
60
61
  - !ruby/object:Gem::Version
61
- version: '1.8'
62
+ version: '0.0'
62
63
  type: :runtime
63
64
  prerelease: false
64
65
  version_requirements: !ruby/object:Gem::Requirement
65
66
  requirements:
66
67
  - - "~>"
67
68
  - !ruby/object:Gem::Version
68
- version: '1.8'
69
+ version: '0.0'
69
70
  - !ruby/object:Gem::Dependency
70
- name: scripto
71
+ name: hashie
71
72
  requirement: !ruby/object:Gem::Requirement
72
73
  requirements:
73
74
  - - "~>"
74
75
  - !ruby/object:Gem::Version
75
- version: '0'
76
+ version: '4.1'
76
77
  type: :runtime
77
78
  prerelease: false
78
79
  version_requirements: !ruby/object:Gem::Requirement
79
80
  requirements:
80
81
  - - "~>"
81
82
  - !ruby/object:Gem::Version
82
- version: '0'
83
+ version: '4.1'
83
84
  - !ruby/object:Gem::Dependency
84
- name: slop
85
+ name: httpdisk
85
86
  requirement: !ruby/object:Gem::Requirement
86
87
  requirements:
87
88
  - - "~>"
88
89
  - !ruby/object:Gem::Version
89
- version: '4.6'
90
+ version: '0.5'
90
91
  type: :runtime
91
92
  prerelease: false
92
93
  version_requirements: !ruby/object:Gem::Requirement
93
94
  requirements:
94
95
  - - "~>"
95
96
  - !ruby/object:Gem::Version
96
- version: '4.6'
97
+ version: '0.5'
97
98
  - !ruby/object:Gem::Dependency
98
- name: stringex
99
+ name: nokogiri
99
100
  requirement: !ruby/object:Gem::Requirement
100
101
  requirements:
101
102
  - - "~>"
102
103
  - !ruby/object:Gem::Version
103
- version: '2.8'
104
+ version: '1.11'
104
105
  type: :runtime
105
106
  prerelease: false
106
107
  version_requirements: !ruby/object:Gem::Requirement
107
108
  requirements:
108
109
  - - "~>"
109
110
  - !ruby/object:Gem::Version
110
- version: '2.8'
111
+ version: '1.11'
111
112
  - !ruby/object:Gem::Dependency
112
- name: minitest
113
- requirement: !ruby/object:Gem::Requirement
114
- requirements:
115
- - - "~>"
116
- - !ruby/object:Gem::Version
117
- version: '5.11'
118
- type: :development
119
- prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- requirements:
122
- - - "~>"
123
- - !ruby/object:Gem::Version
124
- version: '5.11'
125
- - !ruby/object:Gem::Dependency
126
- name: rake
113
+ name: slop
127
114
  requirement: !ruby/object:Gem::Requirement
128
115
  requirements:
129
116
  - - "~>"
130
117
  - !ruby/object:Gem::Version
131
- version: '12.3'
132
- type: :development
118
+ version: '4.8'
119
+ type: :runtime
133
120
  prerelease: false
134
121
  version_requirements: !ruby/object:Gem::Requirement
135
122
  requirements:
136
123
  - - "~>"
137
124
  - !ruby/object:Gem::Version
138
- version: '12.3'
125
+ version: '4.8'
139
126
  - !ruby/object:Gem::Dependency
140
- name: webmock
127
+ name: sterile
141
128
  requirement: !ruby/object:Gem::Requirement
142
129
  requirements:
143
130
  - - "~>"
144
131
  - !ruby/object:Gem::Version
145
- version: '3.4'
146
- type: :development
132
+ version: '1.0'
133
+ type: :runtime
147
134
  prerelease: false
148
135
  version_requirements: !ruby/object:Gem::Requirement
149
136
  requirements:
150
137
  - - "~>"
151
138
  - !ruby/object:Gem::Version
152
- version: '3.4'
139
+ version: '1.0'
153
140
  description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
154
141
  email:
155
142
  - amd@gurge.com
@@ -158,97 +145,48 @@ executables:
158
145
  extensions: []
159
146
  extra_rdoc_files: []
160
147
  files:
148
+ - ".github/workflows/test.yml"
161
149
  - ".gitignore"
162
150
  - ".rubocop.yml"
163
- - ".travis.yml"
164
- - ".vscode/extensions.json"
165
- - ".vscode/settings.json"
166
151
  - Gemfile
152
+ - Gemfile.lock
167
153
  - LICENSE
168
154
  - README.md
169
155
  - Rakefile
170
156
  - bin/sinew
171
157
  - lib/sinew.rb
172
- - lib/sinew/cache.rb
173
- - lib/sinew/core_ext.rb
174
- - lib/sinew/dsl.rb
158
+ - lib/sinew/args.rb
159
+ - lib/sinew/base.rb
160
+ - lib/sinew/csv.rb
175
161
  - lib/sinew/main.rb
162
+ - lib/sinew/middleware/log_formatter.rb
176
163
  - lib/sinew/nokogiri_ext.rb
177
- - lib/sinew/output.rb
178
- - lib/sinew/request.rb
179
164
  - lib/sinew/response.rb
180
- - lib/sinew/runtime_options.rb
181
165
  - lib/sinew/version.rb
166
+ - sample.rb
182
167
  - sample.sinew
183
168
  - sinew.gemspec
184
- - test/legacy/eu.httpbin.org/head/redirect,3
185
- - test/legacy/eu.httpbin.org/head/status,500
186
- - test/legacy/eu.httpbin.org/redirect,3
187
- - test/legacy/eu.httpbin.org/status,500
188
- - test/legacy/legacy.sinew
189
- - test/recipes/array_header.sinew
190
- - test/recipes/basic.sinew
191
- - test/recipes/dups.sinew
192
- - test/recipes/implicit_header.sinew
193
- - test/recipes/limit.sinew
194
- - test/recipes/noko.sinew
195
- - test/recipes/uri.sinew
196
- - test/recipes/xml.sinew
197
- - test/test.html
198
- - test/test_cache.rb
199
- - test/test_helper.rb
200
- - test/test_legacy.rb
201
- - test/test_main.rb
202
- - test/test_nokogiri_ext.rb
203
- - test/test_output.rb
204
- - test/test_recipes.rb
205
- - test/test_requests.rb
206
- - test/test_utf8.rb
207
169
  homepage: http://github.com/gurgeous/sinew
208
170
  licenses:
209
171
  - MIT
210
172
  metadata: {}
211
- post_install_message:
173
+ post_install_message:
212
174
  rdoc_options: []
213
175
  require_paths:
214
176
  - lib
215
177
  required_ruby_version: !ruby/object:Gem::Requirement
216
178
  requirements:
217
- - - "~>"
179
+ - - ">="
218
180
  - !ruby/object:Gem::Version
219
- version: '2.3'
181
+ version: '2.7'
220
182
  required_rubygems_version: !ruby/object:Gem::Requirement
221
183
  requirements:
222
184
  - - ">="
223
185
  - !ruby/object:Gem::Version
224
186
  version: '0'
225
187
  requirements: []
226
- rubyforge_project: sinew
227
- rubygems_version: 2.7.6
228
- signing_key:
188
+ rubygems_version: 3.1.4
189
+ signing_key:
229
190
  specification_version: 4
230
191
  summary: Sinew - structured web crawling using recipes.
231
- test_files:
232
- - test/legacy/eu.httpbin.org/head/redirect,3
233
- - test/legacy/eu.httpbin.org/head/status,500
234
- - test/legacy/eu.httpbin.org/redirect,3
235
- - test/legacy/eu.httpbin.org/status,500
236
- - test/legacy/legacy.sinew
237
- - test/recipes/array_header.sinew
238
- - test/recipes/basic.sinew
239
- - test/recipes/dups.sinew
240
- - test/recipes/implicit_header.sinew
241
- - test/recipes/limit.sinew
242
- - test/recipes/noko.sinew
243
- - test/recipes/uri.sinew
244
- - test/recipes/xml.sinew
245
- - test/test.html
246
- - test/test_cache.rb
247
- - test/test_helper.rb
248
- - test/test_legacy.rb
249
- - test/test_main.rb
250
- - test/test_nokogiri_ext.rb
251
- - test/test_output.rb
252
- - test/test_recipes.rb
253
- - test/test_requests.rb
254
- - test/test_utf8.rb
192
+ test_files: []
data/.travis.yml DELETED
@@ -1,4 +0,0 @@
1
- language: ruby
2
- rvm:
3
- - 2.3.7
4
- - 2.5.1
data/.vscode/extensions.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "recommendations": ["rebornix.Ruby"]
3
- }
data/.vscode/settings.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "editor.formatOnSave": true,
3
- "editor.formatOnSaveTimeout": 1500,
4
- "editor.tabSize": 2,
5
- "editor.wordSeparators": "`~#$%^&*()-=+[{]}\\|;:'\",.<>/",
6
- "files.associations": {
7
- "*.sinew": "ruby"
8
- },
9
- "files.insertFinalNewline": true,
10
- "files.trimTrailingWhitespace": true,
11
- "ruby.format": "rubocop",
12
- "ruby.lint": {
13
- "rubocop": true
14
- }
15
- }
data/lib/sinew/cache.rb DELETED
@@ -1,79 +0,0 @@
1
- require 'fileutils'
2
- require 'tempfile'
3
-
4
- #
5
- # This class handles the caching of http responses on disk.
6
- #
7
-
8
- module Sinew
9
- class Cache
10
- attr_reader :sinew
11
-
12
- def initialize(sinew)
13
- @sinew = sinew
14
- end
15
-
16
- def get(request)
17
- body = read_if_exist(body_path(request))
18
- return nil if !body
19
-
20
- head = read_if_exist(head_path(request))
21
- Response.from_cache(request, body, head)
22
- end
23
-
24
- def set(response)
25
- body_path = body_path(response.request)
26
- head_path = head_path(response.request)
27
-
28
- FileUtils.mkdir_p(File.dirname(body_path))
29
- FileUtils.mkdir_p(File.dirname(head_path))
30
-
31
- # write body, and head if necessary
32
- atomic_write(body_path, response.body)
33
- if head_necessary?(response)
34
- head = JSON.pretty_generate(response.head_as_json)
35
- atomic_write(head_path, head)
36
- end
37
- end
38
-
39
- def root_dir
40
- sinew.options[:cache]
41
- end
42
- protected :root_dir
43
-
44
- def head_necessary?(response)
45
- response.error? || response.redirected?
46
- end
47
- protected :head_necessary?
48
-
49
- def body_path(request)
50
- "#{root_dir}/#{request.cache_key}"
51
- end
52
- protected :body_path
53
-
54
- def head_path(request)
55
- body_path = body_path(request)
56
- dir, base = File.dirname(body_path), File.basename(body_path)
57
- "#{dir}/head/#{base}"
58
- end
59
- protected :head_path
60
-
61
- def read_if_exist(path)
62
- if File.exist?(path)
63
- IO.read(path, mode: 'r:UTF-8')
64
- end
65
- end
66
- protected :read_if_exist
67
-
68
- def atomic_write(path, data)
69
- tmp = Tempfile.new('sinew', encoding: 'UTF-8')
70
- tmp.write(data)
71
- tmp.close
72
- FileUtils.chmod(0o644, tmp.path)
73
- FileUtils.mv(tmp.path, path)
74
- ensure
75
- FileUtils.rm(tmp.path, force: true)
76
- end
77
- protected :atomic_write
78
- end
79
- end