curlyq 0.0.2

data/bin/curlyq ADDED
@@ -0,0 +1,477 @@
+ #!/usr/bin/env ruby
+ require 'gli'
+ require 'curly'
+ require 'curly/curl'
+
+ include GLI::App
+
+ program_desc 'A scriptable interface to curl'
+
+ version Curly::VERSION
+
+ subcommand_option_handling :normal
+ arguments :strict
+
+ ImageType = Class.new(Symbol)
+ accept ImageType do |value|
+   value.normalize_image_type(:all)
+ end
+
+ BrowserType = Class.new(Symbol)
+ accept BrowserType do |value|
+   value.normalize_browser_type(:none)
+ end
+
+ ScreenshotType = Class.new(Symbol)
+ accept ScreenshotType do |value|
+   value.normalize_screenshot_type(:full_page)
+ end
+
+ desc 'Output YAML instead of JSON'
+ switch %i[y yaml]
+
+ desc 'Output "pretty" JSON'
+ switch %i[pretty], default_value: true, negatable: true
+
+ # TODO: Post method, html and json with --data flags, accept key=value and files
+ # TODO: Handle binary responses, deal gracefully with compressed data
+ # TODO: File uploads?
+
+ def self.break_headers(headers)
+   out = {}
+   headers.each do |h|
+     m = h.match(/(?<key>[^=]+)=(?<value>.*?)$/)
+     out[m['key'].strip] = m['value'].strip
+   end
+   out
+ end
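+
+ # Illustrative example (not in the original source): given header
+ # arguments like ['x-api-key=abc123', 'Accept=application/json'],
+ # break_headers returns
+ #   { 'x-api-key' => 'abc123', 'Accept' => 'application/json' }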
+
+ def self.print_out(output, yaml, raw: false, pretty: true)
+   output = output.to_data if output.respond_to?(:to_data)
+   # Was intended to flatten single responses, but not getting an array back is unpredictable
+   # output = output[0] if output&.is_a?(Array) && output.count == 1
+   if output.is_a?(String)
+     print output
+   elsif raw
+     output = output.join("\n") if output.is_a?(Array)
+     print output
+   else
+     if yaml
+       print YAML.dump(output)
+     else
+       print pretty ? JSON.pretty_generate(output) : JSON.generate(output)
+     end
+   end
+ end
+
+ desc 'Curl URL and output its elements, multiple URLs allowed'
+ arg_name 'URL', multiple: true
+ command %i[html curl] do |c|
+   c.desc 'Only retrieve headers/info'
+   c.switch %i[I info], negatable: false
+
+   c.desc 'Return an array of matches to a CSS or XPath query'
+   c.flag %i[search]
+
+   c.desc 'Define a header to send as "key=value"'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Use a browser to retrieve a dynamic web page (firefox, chrome)'
+   c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/
+
+   c.desc %(If curl doesn't work, use a fallback browser (firefox, chrome))
+   c.flag %i[f fallback], type: BrowserType, must_match: /^[fc].*?$/
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed], negatable: false
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.desc 'Output a raw value for a key'
+   c.flag %i[r raw]
+
+   c.desc 'Ignore relative hrefs when gathering content links'
+   c.switch %i[ignore_relative], negatable: true
+
+   c.desc 'Ignore fragment hrefs when gathering content links'
+   c.switch %i[ignore_fragments], negatable: true
+
+   c.desc 'Only gather external links'
+   c.switch %i[x external_links_only], default_value: false, negatable: false
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, browser: options[:browser], fallback: options[:fallback],
+                            headers: headers, headers_only: options[:info],
+                            compressed: options[:compressed], clean: options[:clean],
+                            ignore_local_links: options[:ignore_relative],
+                            ignore_fragment_links: options[:ignore_fragments],
+                            external_links_only: options[:external_links_only])
+
+       if options[:info]
+         output.push(res.headers)
+         # print_out(res.headers, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
+         next
+       end
+
+       if options[:search]
+         out = res.search(options[:search])
+
+         out = out.dot_query(options[:query]) if options[:query]
+         output.push(out)
+       elsif options[:query]
+         queried = res.to_data.dot_query(options[:query])
+         output.push(queried) if queried
+       else
+         output.push(res.to_data(url: url))
+       end
+     end
+
+     output.delete_if(&:empty?)
+     output = output[0] if output.count == 1
+     output.map! { |o| o[options[:raw].to_sym] } if options[:raw]
+
+     print_out(output, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
+   end
+ end
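+
+ # Example invocations (illustrative only, using the flags defined above):
+ #   curlyq html --search 'h1' https://example.com
+ #   curlyq html --info --header 'User-Agent=curlyq' https://example.com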
+
+ desc 'Save a screenshot of a URL'
+ arg_name 'URL', multiple: true
+ command :screenshot do |c|
+   c.desc 'Type of screenshot to save (full (requires firefox), print, visible)'
+   c.flag %i[t type], type: ScreenshotType, must_match: /^[fpv].*?$/, default_value: 'full'
+
+   c.desc 'Browser to use (firefox, chrome)'
+   c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/, default_value: 'chrome'
+
+   c.desc 'File destination'
+   c.flag %i[o out file]
+
+   c.action do |_, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     urls.each do |url|
+       page = Curl::Html.new(url)
+       page.screenshot(options[:out], browser: options[:browser], type: options[:type])
+     end
+   end
+ end
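+
+ # Example (illustrative): save a full-page screenshot with Firefox
+ #   curlyq screenshot --browser firefox --type full --out shot.png https://example.com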
+
+ desc 'Get a JSON response from a URL, multiple URLs allowed'
+ arg_name 'URL', multiple: true
+ command :json do |c|
+   c.desc 'Define a header to send as key=value'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Json.new(url, headers: headers, compressed: options[:compressed], symbolize_names: false)
+
+       json = res.json
+
+       if json.nil?
+         output.push({
+           status: 'error parsing JSON',
+           url: res.url,
+           code: res.code,
+           headers: res.headers
+         })
+       else
+         json = json.dot_query(options[:query]) if options[:query]
+
+         output.push(json)
+       end
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
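+
+ # Example (illustrative; the dot-syntax path shown is hypothetical):
+ #   curlyq json --query 'data.items' https://api.example.com/endpoint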
+
+ desc 'Extract contents between two regular expressions'
+ arg_name 'URL', multiple: true
+ command :extract do |c|
+   c.desc 'Text before extraction, parsed as regex'
+   c.flag %i[b before]
+
+   c.desc 'Text after extraction, parsed as regex'
+   c.flag %i[a after]
+
+   c.desc 'Define a header to send as key=value'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.desc 'Strip HTML tags from results'
+   c.switch %i[strip]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, headers: headers, headers_only: false,
+                            compressed: options[:compressed], clean: options[:clean])
+       extracted = res.extract(options[:before], options[:after])
+       extracted.strip_tags! if options[:strip]
+       output.concat(extracted)
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
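+
+ # Example (illustrative): pull the text between two regex boundaries
+ #   curlyq extract --before '<h1>' --after '</h1>' --strip https://example.com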
+
+ desc 'Extract all instances of a tag'
+ arg_name 'URL', multiple: true
+ command :tags do |c|
+   c.desc 'Define a header to send as key=value'
+   c.flag %i[h header], multiple: true
+
+   c.desc 'Specify a tag to collect'
+   c.flag %i[t tag], multiple: true
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.desc 'CSS/XPath query'
+   c.flag %i[q query search]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+     headers = break_headers(options[:header])
+     tags = options[:tag].join(' ').split(/[, ]+/)
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, headers: headers, headers_only: false,
+                            compressed: options[:compressed], clean: options[:clean])
+       if options[:search]
+         output.concat(res.tags.search(options[:search]))
+       elsif tags.count.positive?
+         tags.each { |tag| output.concat(res.tags(tag)) }
+       else
+         output.concat(res.tags)
+       end
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
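+
+ # Example (illustrative): collect every h2 and h3 tag from a page
+ #   curlyq tags --tag h2,h3 https://example.com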
+
+ desc 'Extract all images from a URL'
+ arg_name 'URL', multiple: true
+ command :images do |c|
+   c.desc 'Type of images to return (img, srcset, opengraph, all)'
+   c.flag %i[t type], multiple: true, type: ImageType, default_value: ['all']
+
+   c.desc 'Expect compressed results'
+   c.switch %i[c compressed]
+
+   c.desc 'Remove extra whitespace from results'
+   c.switch %i[clean]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     output = []
+
+     types = options[:type].join(' ').split(/[ ,]+/).map(&:normalize_image_type)
+
+     urls.each do |url|
+       res = Curl::Html.new(url, compressed: options[:compressed], clean: options[:clean])
+       output.concat(res.images(types: types))
+     end
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
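+
+ # Example (illustrative): return only srcset and OpenGraph images
+ #   curlyq images --type srcset,opengraph https://example.com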
+
+ desc %(Return all links on a URL's page)
+ arg_name 'URL', multiple: true
+ command :links do |c|
+   c.desc 'Ignore relative hrefs when gathering content links'
+   c.switch %i[ignore_relative], negatable: true
+
+   c.desc 'Ignore fragment hrefs when gathering content links'
+   c.switch %i[ignore_fragments], negatable: true
+
+   c.desc 'Only gather external links'
+   c.switch %i[x external_links_only], default_value: false, negatable: false
+
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.desc 'Filter out duplicate links, keeping only the first occurrence'
+   c.switch %i[d dedup], negatable: true
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, compressed: options[:compressed], clean: options[:clean],
+                            ignore_local_links: options[:ignore_relative],
+                            ignore_fragment_links: options[:ignore_fragments],
+                            external_links_only: options[:external_links_only])
+
+       if options[:query]
+         query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
+         queried = { links: res.to_data[:links] }.dot_query(query)
+         output.concat(queried) if queried
+       else
+         output.concat(res.body_links)
+       end
+     end
+
+     output.dedup_links! if options[:dedup]
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
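+
+ # Example (illustrative): external links only, with duplicates removed
+ #   curlyq links --external_links_only --dedup https://example.com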
+
+ desc %(Return all <head> links on a URL's page)
+ arg_name 'URL', multiple: true
+ command :headlinks do |c|
+   c.desc 'Filter output using dot-syntax path'
+   c.flag %i[q query filter]
+
+   c.action do |global_options, options, args|
+     urls = args.join(' ').split(/[, ]+/)
+
+     output = []
+
+     urls.each do |url|
+       res = Curl::Html.new(url, compressed: options[:compressed], clean: options[:clean])
+
+       if options[:query]
+         query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
+         queried = { links: res.to_data[:meta_links] }.dot_query(query)
+         output.concat(queried) if queried
+       else
+         output.push(res.to_data[:meta_links])
+       end
+     end
+
+     output = output[0] if output.count == 1
+
+     print_out(output, global_options[:yaml], pretty: global_options[:pretty])
+   end
+ end
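+
+ # Example (illustrative): list the <head> link elements of a page
+ #   curlyq headlinks https://example.com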
390
+
391
+ desc %(Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed.)
392
+ arg_name 'URL', multiple: true
393
+ command :scrape do |c|
394
+ c.desc 'Browser to use (firefox, chrome)'
395
+ c.flag %i[b browser], type: BrowserType
396
+
397
+ c.desc 'Regurn an array of matches to a CSS or XPath query'
398
+ c.flag %i[search]
399
+
400
+ c.desc 'Define a header to send as "key=value"'
401
+ c.flag %i[h header], multiple: true
402
+
403
+ c.desc 'Remove extra whitespace from results'
404
+ c.switch %i[clean]
405
+
406
+ c.desc 'Filter output using dot-syntax path'
407
+ c.flag %i[q query filter]
408
+
409
+ c.desc 'Output a raw value for a key'
410
+ c.flag %i[r raw]
411
+
412
+ c.action do |global_options, options, args|
413
+ urls = args.join(' ').split(/[, ]+/)
414
+
415
+ output = []
416
+
417
+ urls.each do |url|
418
+ driver = Selenium::WebDriver.for options[:browser]
419
+ begin
420
+ driver.get url
421
+ res = driver.page_source
422
+
423
+ res = Curl::Html.new(nil, source: res, clean: options[:clean])
424
+ if options[:search]
425
+ out = res.search(options[:search])
426
+
427
+ out = out.dot_query(options[:query]) if options[:query]
428
+ output.push(out)
429
+ elsif options[:query]
430
+ queried = res.to_data(url: url).dot_query(options[:query])
431
+ output = queried if queried
432
+ else
433
+ output.push(res.to_data(url: url))
434
+ end
435
+
436
+ # elements = driver.find_elements(css: options[:query])
437
+
438
+ # elements.each { |e| output.push(e.text.strip) }
439
+ ensure
440
+ driver.quit
441
+ end
442
+ end
443
+
444
+ output.delete_if(&:empty?)
445
+
446
+ output = output[0] if output.count == 1
447
+
448
+ if options[:raw]
449
+ output.map! { |o| o[options[:raw].to_sym] }
450
+ end
451
+
452
+ print_out(output, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
453
+ end
454
+ end
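+
+ # Example (illustrative): render a JS-driven page in Chrome and search it
+ #   curlyq scrape --browser chrome --search 'article a' https://example.com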
+
+ pre do |global, command, options, args|
+   # Pre logic here
+   # Return true to proceed; false to abort and not call the
+   # chosen command
+   # Use skips_pre before a command to skip this block
+   # on that command only
+   true
+ end
+
+ post do |global, command, options, args|
+   # Post logic here
+   # Use skips_post before a command to skip this
+   # block on that command only
+ end
+
+ on_error do |exception|
+   # Error logic here
+   # return false to skip default error handling
+   true
+ end
+
+ exit run(ARGV)
data/curlyq.gemspec ADDED
@@ -0,0 +1,27 @@
+ # Ensure we require the local version and not one we might have installed already
+ require File.join([File.dirname(__FILE__), 'lib', 'curly', 'version.rb'])
+ spec = Gem::Specification.new do |s|
+   s.name = 'curlyq'
+   s.version = Curly::VERSION
+   s.author = 'Brett Terpstra'
+   s.email = 'me@brettterpstra.com'
+   s.homepage = 'https://brettterpstra.com'
+   s.platform = Gem::Platform::RUBY
+   s.licenses = 'MIT'
+   s.summary = 'A CLI helper for curl and web scraping'
+   s.files = `git ls-files`.split("\n")
+   s.require_paths << 'lib'
+   s.extra_rdoc_files = ['README.rdoc', 'curlyq.rdoc']
+   s.rdoc_options << '--title' << 'curlyq' << '--main' << 'README.rdoc' << '-ri'
+   s.bindir = 'bin'
+   s.executables << 'curlyq'
+   s.add_development_dependency('rake', '~> 0.9.2')
+   s.add_development_dependency('rdoc', '~> 4.3')
+   s.add_development_dependency('minitest', '~> 5.14')
+   s.add_development_dependency('yard', '~> 0.9', '>= 0.9.26')
+   s.add_runtime_dependency('gli', '~> 2.21.0')
+   s.add_runtime_dependency('tty-which', '~> 0.5.0')
+   s.add_runtime_dependency('nokogiri', '~> 1.16.0')
+   s.add_runtime_dependency('selenium-webdriver', '~> 4.16.0')
+ end