curlyq 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/curlyq CHANGED
@@ -71,7 +71,7 @@ command %i[html curl] do |c|
71
71
  c.switch %i[I info], negatable: false
72
72
 
73
73
  c.desc 'Regurn an array of matches to a CSS or XPath query'
74
- c.flag %i[search]
74
+ c.flag %i[s search]
75
75
 
76
76
  c.desc 'Define a header to send as "key=value"'
77
77
  c.flag %i[h header], multiple: true
@@ -110,25 +110,31 @@ command %i[html curl] do |c|
110
110
  output = []
111
111
 
112
112
  urls.each do |url|
113
- res = Curl::Html.new(url, { browser: options[:browser], fallback: options[:fallback],
114
- headers: headers, headers_only: options[:info],
115
- compressed: options[:compressed], clean: options[:clean],
116
- ignore_local_links: options[:ignore_relative],
117
- ignore_fragment_links: options[:ignore_fragments],
118
- external_links_only: options[:external_links_only] })
113
+ curl_settings = { browser: options[:browser], fallback: options[:fallback],
114
+ headers: headers, headers_only: options[:info],
115
+ compressed: options[:compressed], clean: options[:clean],
116
+ ignore_local_links: options[:ignore_relative],
117
+ ignore_fragment_links: options[:ignore_fragments],
118
+ external_links_only: options[:external_links_only] }
119
+ res = Curl::Html.new(url, curl_settings)
119
120
  res.curl
120
121
 
121
122
  if options[:info]
122
123
  output.push(res.headers)
123
- # print_out(res.headers, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
124
124
  next
125
125
  end
126
126
 
127
127
  if options[:search]
128
- out = res.search(options[:search])
128
+ source = res.search(options[:search], return_source: true)
129
129
 
130
- out = out.dot_query(options[:query]) if options[:query]
131
- output.push(out)
130
+ out = res.parse(source)
131
+
132
+ if options[:query]
133
+ out = out.to_data(url: url, clean: options[:clean]).dot_query(options[:query])
134
+ else
135
+ out = out.to_data
136
+ end
137
+ output.push([out])
132
138
  elsif options[:query]
133
139
  queried = res.to_data.dot_query(options[:query])
134
140
  output.push(queried) if queried
@@ -136,7 +142,7 @@ command %i[html curl] do |c|
136
142
  output.push(res.to_data(url: url))
137
143
  end
138
144
  end
139
-
145
+ output.delete_if(&:nil?)
140
146
  output.delete_if(&:empty?)
141
147
  output = output[0] if output.count == 1
142
148
  output.map! { |o| o[options[:raw].to_sym] } if options[:raw]
@@ -149,13 +155,13 @@ desc 'Save a screenshot of a URL'
149
155
  arg_name 'URL', multiple: true
150
156
  command :screenshot do |c|
151
157
  c.desc 'Type of screenshot to save (full (requires firefox), print, visible)'
152
- c.flag %i[t type], type: ScreenshotType, must_match: /^[fpv].*?$/, default_value: 'full'
158
+ c.flag %i[t type], type: ScreenshotType, must_match: /^[fpv].*?$/, default_value: 'visible'
153
159
 
154
160
  c.desc 'Browser to use (firefox, chrome)'
155
161
  c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/, default_value: 'chrome'
156
162
 
157
163
  c.desc 'File destination'
158
- c.flag %i[o out file]
164
+ c.flag %i[o out file], required: true
159
165
 
160
166
  c.desc 'Define a header to send as key=value'
161
167
  c.flag %i[h header], multiple: true
@@ -164,11 +170,19 @@ command :screenshot do |c|
164
170
  urls = args.join(' ').split(/[, ]+/)
165
171
  headers = break_headers(options[:header])
166
172
 
173
+ type = options[:type]
174
+ browser = options[:browser]
175
+
176
+ type = type.is_a?(Symbol) ? type : type.normalize_screenshot_type
177
+ browser = browser.is_a?(Symbol) ? browser : browser.normalize_browser_type
178
+
179
+ raise 'Full page screen shots only available with Firefox' if type == :full_page && browser != :firefox
180
+
167
181
  urls.each do |url|
168
182
  c = Curl::Html.new(url)
169
183
  c.headers = headers
170
- c.browser = options[:browser]
171
- c.screenshot(options[:out], type: options[:type])
184
+ c.browser = browser
185
+ c.screenshot(options[:out], type: type)
172
186
  end
173
187
  end
174
188
  end
@@ -208,12 +222,26 @@ command :json do |c|
208
222
  headers: res.headers
209
223
  })
210
224
  else
211
- json = json.dot_query(options[:query]) if options[:query]
225
+ if options[:query]
226
+ if options[:query] =~ /^json$/
227
+ res = json
228
+ elsif options[:query] =~ /^json\./
229
+ query = options[:query].sub(/^json\./, '')
230
+ else
231
+ query = options[:query]
232
+ end
233
+
234
+ res = json.dot_query(query)
235
+ else
236
+ res = res.to_data
237
+ end
212
238
 
213
- output.push(json)
239
+ output.push(res)
214
240
  end
215
241
  end
216
242
 
243
+ output = output[0] if output.count == 1
244
+
217
245
  print_out(output, global_options[:yaml], pretty: global_options[:pretty])
218
246
  end
219
247
  end
@@ -221,12 +249,18 @@ end
221
249
  desc 'Extract contents between two regular expressions'
222
250
  arg_name 'URL', multiple: true
223
251
  command :extract do |c|
224
- c.desc 'Text before extraction, parsed as regex'
252
+ c.desc 'Text before extraction'
225
253
  c.flag %i[b before]
226
254
 
227
- c.desc 'Text after extraction, parsed as regex'
255
+ c.desc 'Text after extraction'
228
256
  c.flag %i[a after]
229
257
 
258
+ c.desc 'Process before/after strings as regular expressions'
259
+ c.switch %i[r regex]
260
+
261
+ c.desc 'Include the before/after matches in the result'
262
+ c.switch %i[i include]
263
+
230
264
  c.desc 'Define a header to send as key=value'
231
265
  c.flag %i[h header], multiple: true
232
266
 
@@ -249,7 +283,15 @@ command :extract do |c|
249
283
  res = Curl::Html.new(url, { headers: headers, headers_only: false,
250
284
  compressed: options[:compressed], clean: options[:clean] })
251
285
  res.curl
252
- extracted = res.extract(options[:before], options[:after])
286
+ if options[:regex]
287
+ before = Regexp.new(options[:before])
288
+ after = Regexp.new(options[:after])
289
+ else
290
+ before = /#{Regexp.escape(options[:before])}/
291
+ after = /#{Regexp.escape(options[:after])}/
292
+ end
293
+
294
+ extracted = res.extract(before, after, inclusive: options[:include])
253
295
  extracted.strip_tags! if options[:strip]
254
296
  output.concat(extracted)
255
297
  end
@@ -262,10 +304,10 @@ desc 'Extract all instances of a tag'
262
304
  arg_name 'URL', multiple: true
263
305
  command :tags do |c|
264
306
  c.desc 'Define a header to send as key=value'
265
- c.flag %i[h header], multiple: true
307
+ c.flag %i[h header], multiple: true, arg_name: 'KEY=VAL'
266
308
 
267
309
  c.desc 'Specify a tag to collect'
268
- c.flag %i[t tag], multiple: true
310
+ c.flag %i[t tag], multiple: true, arg_name: 'TAG'
269
311
 
270
312
  c.desc 'Expect compressed results'
271
313
  c.switch %i[c compressed]
@@ -273,8 +315,14 @@ command :tags do |c|
273
315
  c.desc 'Remove extra whitespace from results'
274
316
  c.switch %i[clean]
275
317
 
276
- c.desc 'CSS/XPath query'
277
- c.flag %i[q query search]
318
+ c.desc 'Output the HTML source of the results'
319
+ c.switch %i[source html]
320
+
321
+ c.desc 'Dot syntax query to filter results'
322
+ c.flag %i[q query filter], arg_name: 'DOT_SYNTAX'
323
+
324
+ c.desc 'Regurn an array of matches to a CSS or XPath query'
325
+ c.flag %i[search], arg_name: 'CSS/XPATH'
278
326
 
279
327
  c.action do |global_options, options, args|
280
328
  urls = args.join(' ').split(/[, ]+/)
@@ -286,9 +334,17 @@ command :tags do |c|
286
334
  res = Curl::Html.new(url, { headers: headers, headers_only: options[:headers],
287
335
  compressed: options[:compressed], clean: options[:clean] })
288
336
  res.curl
337
+
289
338
  output = []
290
339
  if options[:search]
291
- output = res.tags.search(options[:search])
340
+ out = res.search(options[:search])
341
+
342
+ out = out.dot_query(options[:query]) if options[:query]
343
+ output.push(out)
344
+ elsif options[:query]
345
+ query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
346
+
347
+ output = res.to_data.dot_query(query)
292
348
  elsif tags.count.positive?
293
349
  tags.each { |tag| output.concat(res.tags(tag)) }
294
350
  else
@@ -296,7 +352,13 @@ command :tags do |c|
296
352
  end
297
353
  end
298
354
 
299
- print_out(output, global_options[:yaml], pretty: global_options[:pretty])
355
+ output = output[0] if output.count == 1
356
+
357
+ if options[:source]
358
+ puts output.to_html
359
+ else
360
+ print_out(output, global_options[:yaml], pretty: global_options[:pretty])
361
+ end
300
362
  end
301
363
  end
302
364
 
@@ -312,6 +374,9 @@ command :images do |c|
312
374
  c.desc 'Remove extra whitespace from results'
313
375
  c.switch %i[clean]
314
376
 
377
+ c.desc 'Filter output using dot-syntax path'
378
+ c.flag %i[q query filter]
379
+
315
380
  c.desc 'Define a header to send as key=value'
316
381
  c.flag %i[h header], multiple: true
317
382
 
@@ -326,7 +391,15 @@ command :images do |c|
326
391
  urls.each do |url|
327
392
  res = Curl::Html.new(url, { compressed: options[:compressed], clean: options[:clean] })
328
393
  res.curl
329
- output.concat(res.images(types: types))
394
+
395
+ res = res.images(types: types)
396
+
397
+ if options[:query]
398
+ query = options[:query] =~ /^images/ ? options[:query] : "images#{options[:query]}"
399
+ res = { images: res }.dot_query(query)
400
+ end
401
+
402
+ output.concat(res)
330
403
  end
331
404
 
332
405
  print_out(output, global_options[:yaml], pretty: global_options[:pretty])
@@ -367,7 +440,7 @@ command :links do |c|
367
440
 
368
441
  if options[:query]
369
442
  query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
370
- queried = { links: res.to_data[:links] }.dot_query(query)
443
+ queried = res.to_data.dot_query(query)
371
444
  output.concat(queried) if queried
372
445
  else
373
446
  output.concat(res.body_links)
@@ -414,7 +487,7 @@ desc %(Scrape a page using a web browser, for dynamic (JS) pages. Be sure to hav
414
487
  arg_name 'URL', multiple: true
415
488
  command :scrape do |c|
416
489
  c.desc 'Browser to use (firefox, chrome)'
417
- c.flag %i[b browser], type: BrowserType
490
+ c.flag %i[b browser], type: BrowserType, required: true
418
491
 
419
492
  c.desc 'Regurn an array of matches to a CSS or XPath query'
420
493
  c.flag %i[search]
@@ -437,30 +510,19 @@ command :scrape do |c|
437
510
  output = []
438
511
 
439
512
  urls.each do |url|
440
- driver = Selenium::WebDriver.for options[:browser]
441
- begin
442
- driver.get url
443
- res = driver.page_source
444
-
445
- res = Curl::Html.new(nil, { source: res, clean: options[:clean] })
446
- res.curl
447
- if options[:search]
448
- out = res.search(options[:search])
449
-
450
- out = out.dot_query(options[:query]) if options[:query]
451
- output.push(out)
452
- elsif options[:query]
453
- queried = res.to_data(url: url).dot_query(options[:query])
454
- output = queried if queried
455
- else
456
- output.push(res.to_data(url: url))
457
- end
513
+ res = Curl::Html.new(url, { browser: options[:browser], clean: options[:clean] })
514
+ res.curl
458
515
 
459
- # elements = driver.find_elements(css: options[:query])
516
+ if options[:search]
517
+ out = res.search(options[:search])
460
518
 
461
- # elements.each { |e| output.push(e.text.strip) }
462
- ensure
463
- driver.quit
519
+ out = out.dot_query(options[:query]) if options[:query]
520
+ output.push(out)
521
+ elsif options[:query]
522
+ queried = res.to_data(url: url).dot_query(options[:query])
523
+ output.push(queried) if queried
524
+ else
525
+ output.push(res.to_data(url: url))
464
526
  end
465
527
  end
466
528
 
data/curlyq.gemspec CHANGED
@@ -16,10 +16,14 @@ spec = Gem::Specification.new do |s|
16
16
  s.rdoc_options << '--title' << 'curlyq' << '--main' << 'README.rdoc' << '-ri'
17
17
  s.bindir = 'bin'
18
18
  s.executables << 'curlyq'
19
- s.add_development_dependency('rake','~> 0.9.2')
20
- s.add_development_dependency('rdoc', '~> 4.3')
21
- s.add_development_dependency('minitest', '~> 5.14')
19
+ s.add_development_dependency('rake','~> 13.0', '>= 13.0.1')
20
+ s.add_development_dependency('rdoc', '~> 6.3.1')
21
+ s.add_development_dependency('test-unit', '~> 3.4.4')
22
22
  s.add_development_dependency('yard', '~> 0.9', '>= 0.9.26')
23
+ s.add_development_dependency('tty-spinner', '~> 0.9', '>= 0.9.3')
24
+ s.add_development_dependency('tty-progressbar', '~> 0.18', '>= 0.18.2')
25
+ s.add_development_dependency('pastel', '~> 0.8.0')
26
+ s.add_development_dependency('parallel_tests', '~> 3.7', '>= 3.7.3')
23
27
  s.add_runtime_dependency('gli','~> 2.21.0')
24
28
  s.add_runtime_dependency('tty-which','~> 0.5.0')
25
29
  s.add_runtime_dependency('nokogiri','~> 1.16.0')
data/lib/curly/array.rb CHANGED
@@ -66,69 +66,94 @@ class ::Array
66
66
  replace dedup_links
67
67
  end
68
68
 
69
+ #---------------------------------------------------------
70
+ ## Run a query on array elements
69
71
  ##
70
- ## Convert and execute a dot-syntax query on the array
72
+ ## @param path [String] dot.syntax path to compare
71
73
  ##
72
- ## @param path [String] The dot-syntax path
73
- ##
74
- ## @return [Array] Matching elements
74
+ ## @return [Array] elements matching dot query
75
75
  ##
76
76
  def dot_query(path)
77
- output = []
78
- if path =~ /^\[([\d+.])\]\.?/
79
- int = Regexp.last_match(1)
80
- path.sub!(/^\[[\d.]+\]\.?/, '')
81
- items = self[eval(int)]
82
- else
83
- items = self
77
+ filter! do |tag|
78
+ r = tag.dot_query(path)
79
+ if r.is_a?(Array)
80
+ r.count.positive?
81
+ else
82
+ r
83
+ end
84
84
  end
85
85
 
86
- if items.is_a? Hash
87
- output = items.dot_query(path)
88
- else
89
- items.each do |item|
90
- res = item.is_a?(Hash) ? item.stringify_keys : item
91
- out = []
92
- q = path.split(/(?<![\d.])\./)
93
- q.each do |pth|
94
- el = Regexp.last_match(1) if pth =~ /\[([0-9,.]+)\]/
95
- pth.sub!(/\[([0-9,.]+)\]/, '')
96
- ats = []
97
- at = []
98
- while pth =~ /\[[+&,]?\w+ *[\^*$=<>]=? *\w+/
99
- m = pth.match(/\[(?<com>[,+&])? *(?<key>\w+) *(?<op>[\^*$=<>]{1,2}) *(?<val>\w+) */)
100
- comp = [m['key'], m['op'], m['val']]
101
- case m['com']
102
- when ','
103
- ats.push(comp)
104
- at = []
105
- else
106
- at.push(comp)
107
- end
108
-
109
- pth.sub!(/\[(?<com>[,&+])? *(?<key>\w+) *(?<op>[\^*$=<>]{1,2}) *(?<val>\w+)/, '[')
110
- end
111
- ats.push(at) unless at.empty?
112
- pth.sub!(/\[\]/, '')
113
-
114
- return false if el.nil? && ats.empty? && !res.key?(pth)
115
-
116
- res = res[pth] unless pth.empty?
117
-
118
- while ats.count.positive?
119
- atr = ats.shift
120
-
121
- keepers = res.filter do |r|
122
- evaluate_comp(r, atr)
123
- end
124
- out.concat(keepers)
125
- end
126
-
127
- out = out[eval(el)] if out.is_a?(Array) && el =~ /^[\d.,]+$/
86
+ return self
87
+ end
88
+
89
+ def to_html
90
+ map { |el| el.to_html }
91
+ end
92
+
93
+ ##
94
+ ## Test if a tag contains an attribute matching filter queries
95
+ ##
96
+ ## @param tag_name [String] The tag name
97
+ ## @param classes [String] The classes to match
98
+ ## @param id [String] The id attribute to
99
+ ## match
100
+ ## @param attribute [String] The attribute
101
+ ## @param operator [String] The operator, <>= *=
102
+ ## $= ^=
103
+ ## @param value [String] The value to match
104
+ ## @param descendant [Boolean] Check descendant tags
105
+ ##
106
+ def tag_match(tag_name, classes, id, attribute, operator, value, descendant: false)
107
+ tag = self
108
+ keep = true
109
+
110
+ keep = false if tag_name && !tag['tag'] =~ /^#{tag_name}$/i
111
+
112
+ if tag.key?('attrs') && tag['attrs']
113
+ if keep && id
114
+ tag_id = tag['attrs'].filter { |a| a['key'] == 'id' }.first['value']
115
+ keep = tag_id && tag_id =~ /#{id}/i
116
+ end
117
+
118
+ if keep && classes
119
+ cls = tag['attrs'].filter { |a| a['key'] == 'class' }.first
120
+ if cls
121
+ all = true
122
+ classes.each { |c| all = cls['value'].include?(c) }
123
+ keep = all
124
+ else
125
+ keep = false
128
126
  end
129
- output.push(out)
130
127
  end
128
+
129
+ if keep && attribute
130
+ attributes = tag['attrs'].filter { |a| a['key'] =~ /^#{attribute}$/i }
131
+ any = false
132
+ attributes.each do |a|
133
+ break if any
134
+
135
+ any = case operator
136
+ when /^*/
137
+ a['value'] =~ /#{value}/i
138
+ when /^\^/
139
+ a['value'] =~ /^#{value}/i
140
+ when /^\$/
141
+ a['value'] =~ /#{value}$/i
142
+ else
143
+ a['value'] =~ /^#{value}$/i
144
+ end
145
+ end
146
+ keep = any
147
+ end
148
+ end
149
+
150
+ return false if descendant && !keep
151
+
152
+ if !descendant && tag.key?('tags')
153
+ tags = tag['tags'].filter { |t| t.tag_match(tag_name, classes, id, attribute, operator, value) }
154
+ tags.count.positive?
155
+ else
156
+ keep
131
157
  end
132
- output
133
158
  end
134
159
  end