curlyq 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/curlyq ADDED
@@ -0,0 +1,477 @@
+ #!/usr/bin/env ruby
+ require 'gli'
+ require 'curly'
+ require 'curly/curl'
+
+ include GLI::App
+
+ program_desc 'A scriptable interface to curl'
+
+ version Curly::VERSION
+
+ subcommand_option_handling :normal
+ arguments :strict
+
+ ImageType = Class.new(Symbol)
+ accept ImageType do |value|
+   value.normalize_image_type(:all)
+ end
+
+ BrowserType = Class.new(Symbol)
+ accept BrowserType do |value|
+   value.normalize_browser_type(:none)
+ end
+
+ ScreenshotType = Class.new(Symbol)
+ accept ScreenshotType do |value|
+   value.normalize_screenshot_type(:full_page)
+ end
+
+ desc 'Output YAML instead of json'
+ switch %i[y yaml]
+
+ desc 'Output "pretty" JSON'
+ switch %i[pretty], default_value: true, negatable: true
+
+ # TODO: Post method, html and json with --data flags, accept key=value and files
+ # TODO: Handle binary responses, deal gracefully with compressed data
+ # TODO: File uploads?
+
+ def self.break_headers(headers)
+   out = {}
+   headers.each do |h|
+     m = h.match(/(?<key>[^=]+)=(?<value>.*?)$/)
+     out[m['key'].strip] = m['value'].strip
+   end
+   out
+ end
+
+ def self.print_out(output, yaml, raw: false, pretty: true)
+   output = output.to_data if output.respond_to?(:to_data)
+   # Was intended to flatten single responses, but not getting an array back is unpredictable
+   # output = output[0] if output&.is_a?(Array) && output.count == 1
+   if output.is_a?(String)
+     print output
+   elsif raw
+     output = output.join("\n") if output.is_a?(Array)
+     print output
+   else
+     if yaml
+       print YAML.dump(output)
+     else
+       print pretty ? JSON.pretty_generate(output) : JSON.generate(output)
+     end
+   end
+ end
+
+ desc 'Curl URL and output its elements, multiple URLs allowed'
+ arg_name 'URL', multiple: true
+ command %i[html curl] do |c|
+   c.desc 'Only retrieve headers/info'
+   c.switch %i[I info], negatable: false
+
+   c.desc 'Return an array of matches to a CSS or XPath query'
+   c.flag %i[search]
+
+   c.desc 'Define a header to send as "key=value"'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Use a browser to retrieve a dynamic web page (firefox, chrome)'
+   c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/
+
+   c.desc %(If curl doesn't work, use a fallback browser (firefox, chrome))
+   c.flag %i[f fallback], type: BrowserType, must_match: /^[fc].*?$/
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed], negatable: false
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.desc 'Output a raw value for a key'
+   c.flag %i[r raw]
+
+   c.desc 'Ignore relative hrefs when gathering content links'
+   c.switch %i[ignore_relative], negatable: true
+
+   c.desc 'Ignore fragment hrefs when gathering content links'
+   c.switch %i[ignore_fragments], negatable: true
+
+   c.desc 'Only gather external links'
+   c.switch %i[x external_links_only], default_value: false, negatable: false
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, browser: options[:browser], fallback: options[:fallback],
+                            headers: headers, headers_only: options[:info],
+                            compressed: options[:compressed], clean: options[:clean],
+                            ignore_local_links: options[:ignore_relative],
+                            ignore_fragment_links: options[:ignore_fragments],
+                            external_links_only: options[:external_links_only])
+
+       if options[:info]
+         output.push(res.headers)
+         # print_out(res.headers, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
+         next
+       end
+
+       if options[:search]
+         out = res.search(options[:search])
+
+         out = out.dot_query(options[:query]) if options[:query]
+         output.push(out)
+       elsif options[:query]
+         queried = res.to_data.dot_query(options[:query])
+         output.push(queried) if queried
+       else
+         output.push(res.to_data(url: url))
+       end
+     end
+
+     output.delete_if(&:empty?)
+     output = output[0] if output.count == 1
+     output.map! { |o| o[options[:raw].to_sym] } if options[:raw]
+
+     print_out(output, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
+   end
+ end
+
+ desc 'Save a screenshot of a URL'
+ arg_name 'URL', multiple: true
+ command :screenshot do |c|
+   c.desc 'Type of screenshot to save (full (requires firefox), print, visible)'
+   c.flag %i[t type], type: ScreenshotType, must_match: /^[fpv].*?$/, default_value: 'full'
+
+   c.desc 'Browser to use (firefox, chrome)'
+   c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/, default_value: 'chrome'
+
+   c.desc 'File destination'
+   c.flag %i[o out file]
+
+   c.action do |_, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     urls.each do |url|
+       c = Curl::Html.new(url)
+       c.screenshot(options[:out], browser: options[:browser], type: options[:type])
+     end
+   end
+ end
+
+ desc 'Get a JSON response from a URL, multiple URLs allowed'
+ arg_name 'URL', multiple: true
+ command :json do |c|
+   c.desc 'Define a header to send as key=value'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Json.new(url, headers: headers, compressed: options[:compressed], symbolize_names: false)
+
+       json = res.json
+
+       if json.nil?
+         output.push({
+           status: 'error parsing JSON',
+           url: res.url,
+           code: res.code,
+           headers: res.headers
+         })
+       else
+         json = json.dot_query(options[:query]) if options[:query]
+
+         output.push(json)
+       end
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
+
+ desc 'Extract contents between two regular expressions'
+ arg_name 'URL', multiple: true
+ command :extract do |c|
+   c.desc 'Text before extraction, parsed as regex'
+   c.flag %i[b before]
+
+   c.desc 'Text after extraction, parsed as regex'
+   c.flag %i[a after]
+
+   c.desc 'Define a header to send as key=value'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.desc 'Strip HTML tags from results'
+   c.switch %i[strip]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, headers: headers, headers_only: false,
+                            compressed: options[:compressed], clean: options[:clean])
+       extracted = res.extract(options[:before], options[:after])
+       extracted.strip_tags! if options[:strip]
+       output.concat(extracted)
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
+
+ desc 'Extract all instances of a tag'
+ arg_name 'URL', multiple: true
+ command :tags do |c|
+   c.desc 'Define a header to send as key=value'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Specify a tag to collect'
+   c.flag %i[t tag], multiple: true
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.desc 'CSS/XPath query'
+   c.flag %i[q query search]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+     tags = options[:tag].join(' ').split(/[, ]+/)
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, headers: headers, headers_only: options[:headers],
+                            compressed: options[:compressed], clean: options[:clean])
+       if options[:search]
+         output = res.tags.search(options[:search])
+       elsif tags.count.positive?
+         tags.each { |tag| output.concat(res.tags(tag)) }
+       else
+         output.concat(res.tags)
+       end
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
+
+ desc 'Extract all images from a URL'
+ arg_name 'URL', multiple: true
+ command :images do |c|
+   c.desc 'Type of images to return (img, srcset, opengraph, all)'
+   c.flag %i[t type], multiple: true, type: ImageType, default_value: ['all']
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     output = []
+
+     types = options[:type].join(' ').split(/[ ,]+/).map(&:normalize_image_type)
+
+     urls.each do |url|
+       res = Curl::Html.new(url, compressed: options[:compressed], clean: options[:clean])
+       output.concat(res.images(types: types))
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
+
+ desc %(Return all links on a URL's page)
+ arg_name 'URL', multiple: true
+ command :links do |c|
+   c.desc 'Ignore relative hrefs when gathering content links'
+   c.switch %i[ignore_relative], negatable: true
+
+   c.desc 'Ignore fragment hrefs when gathering content links'
+   c.switch %i[ignore_fragments], negatable: true
+
+   c.desc 'Only gather external links'
+   c.switch %i[x external_links_only], default_value: false, negatable: false
+
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.desc 'Filter out duplicate links, preserving only first one'
+   c.switch %i[d dedup], negatable: true
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, compressed: options[:compressed], clean: options[:clean],
+                            ignore_local_links: options[:ignore_relative],
+                            ignore_fragment_links: options[:ignore_fragments],
+                            external_links_only: options[:external_links_only])
+
+       if options[:query]
+         query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
+         queried = { links: res.to_data[:links] }.dot_query(query)
+         output.concat(queried) if queried
+       else
+         output.concat(res.body_links)
+       end
+     end
+
+     output.dedup_links! if options[:dedup]
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
+
+ desc %(Return all <head> links on URL's page)
+ arg_name 'URL', multiple: true
+ command :headlinks do |c|
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, compressed: options[:compressed], clean: options[:clean])
+
+       if options[:query]
+         query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
+         queried = { links: res.to_data[:meta_links] }.dot_query(query)
+         output.concat(queried) if queried
+       else
+         output.push(res.to_data[:meta_links])
+       end
+     end
+
+     output = output[0] if output.count == 1
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
+
+ desc %(Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed.)
+ arg_name 'URL', multiple: true
+ command :scrape do |c|
+   c.desc 'Browser to use (firefox, chrome)'
+   c.flag %i[b browser], type: BrowserType
+
+   c.desc 'Return an array of matches to a CSS or XPath query'
+   c.flag %i[search]
+
+   c.desc 'Define a header to send as "key=value"'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.desc 'Output a raw value for a key'
+   c.flag %i[r raw]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     output = []
+
+     urls.each do |url|
+       driver = Selenium::WebDriver.for options[:browser]
+       begin
+         driver.get url
+         res = driver.page_source
+
+         res = Curl::Html.new(nil, source: res, clean: options[:clean])
+         if options[:search]
+           out = res.search(options[:search])
+
+           out = out.dot_query(options[:query]) if options[:query]
+           output.push(out)
+         elsif options[:query]
+           queried = res.to_data(url: url).dot_query(options[:query])
+           output = queried if queried
+         else
+           output.push(res.to_data(url: url))
+         end
+
+         # elements = driver.find_elements(css: options[:query])
+
+         # elements.each { |e| output.push(e.text.strip) }
+       ensure
+         driver.quit
+       end
+     end
+
+     output.delete_if(&:empty?)
+
+     output = output[0] if output.count == 1
+
+     if options[:raw]
+       output.map! { |o| o[options[:raw].to_sym] }
+     end
+
+     print_out(output, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
+   end
+ end
+
+ pre do |global, command, options, args|
+   # Pre logic here
+   # Return true to proceed; false to abort and not call the
+   # chosen command
+   # Use skips_pre before a command to skip this block
+   # on that command only
+   true
+ end
+
+ post do |global, command, options, args|
+   # Post logic here
+   # Use skips_post before a command to skip this
+   # block on that command only
+ end
+
+ on_error do |exception|
+   # Error logic here
+   # return false to skip default error handling
+   true
+ end
+
+ exit run(ARGV)
data/curlyq.gemspec ADDED
@@ -0,0 +1,27 @@
+ # Ensure we require the local version and not one we might have installed already
+ require File.join([File.dirname(__FILE__),'lib','curly','version.rb'])
+ spec = Gem::Specification.new do |s|
+   s.name = 'curlyq'
+   s.version = Curly::VERSION
+   s.author = 'Brett Terpstra'
+   s.email = 'me@brettterpstra.com'
+   s.homepage = 'https://brettterpstra.com'
+   s.platform = Gem::Platform::RUBY
+   s.licenses = 'MIT'
+   s.summary = 'A CLI helper for curl and web scraping'
+   s.files = `git ls-files`.split("
+ ")
+   s.require_paths << 'lib'
+   s.extra_rdoc_files = ['README.rdoc','curlyq.rdoc']
+   s.rdoc_options << '--title' << 'curlyq' << '--main' << 'README.rdoc' << '-ri'
+   s.bindir = 'bin'
+   s.executables << 'curlyq'
+   s.add_development_dependency('rake','~> 0.9.2')
+   s.add_development_dependency('rdoc', '~> 4.3')
+   s.add_development_dependency('minitest', '~> 5.14')
+   s.add_development_dependency('yard', '~> 0.9', '>= 0.9.26')
+   s.add_runtime_dependency('gli','~> 2.21.0')
+   s.add_runtime_dependency('tty-which','~> 0.5.0')
+   s.add_runtime_dependency('nokogiri','~> 1.16.0')
+   s.add_runtime_dependency('selenium-webdriver', '~> 4.16.0')
+ end