curlyq 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.irbrc +4 -0
- data/CHANGELOG.md +36 -0
- data/Gemfile.lock +30 -7
- data/README.md +260 -17
- data/Rakefile +32 -4
- data/bin/curlyq +114 -52
- data/curlyq.gemspec +7 -3
- data/lib/curly/array.rb +80 -55
- data/lib/curly/curl/html.rb +71 -54
- data/lib/curly/hash.rb +59 -7
- data/lib/curly/version.rb +1 -1
- data/src/_README.md +239 -1
- data/test/curlyq_extract_test.rb +43 -0
- data/test/curlyq_headlinks_test.rb +32 -0
- data/test/curlyq_html_test.rb +25 -0
- data/test/curlyq_images_test.rb +27 -0
- data/test/curlyq_json_test.rb +33 -0
- data/test/curlyq_links_test.rb +20 -0
- data/test/curlyq_scrape_test.rb +22 -0
- data/test/curlyq_tags_test.rb +31 -0
- data/test/helpers/curlyq-helpers.rb +29 -0
- data/test/helpers/fake_std_out.rb +30 -0
- data/test/helpers/threaded_tests.rb +182 -0
- data/test/test_helper.rb +7 -2
- metadata +101 -9
data/bin/curlyq
CHANGED
@@ -71,7 +71,7 @@ command %i[html curl] do |c|
|
|
71
71
|
c.switch %i[I info], negatable: false
|
72
72
|
|
73
73
|
c.desc 'Regurn an array of matches to a CSS or XPath query'
|
74
|
-
c.flag %i[search]
|
74
|
+
c.flag %i[s search]
|
75
75
|
|
76
76
|
c.desc 'Define a header to send as "key=value"'
|
77
77
|
c.flag %i[h header], multiple: true
|
@@ -110,25 +110,31 @@ command %i[html curl] do |c|
|
|
110
110
|
output = []
|
111
111
|
|
112
112
|
urls.each do |url|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
113
|
+
curl_settings = { browser: options[:browser], fallback: options[:fallback],
|
114
|
+
headers: headers, headers_only: options[:info],
|
115
|
+
compressed: options[:compressed], clean: options[:clean],
|
116
|
+
ignore_local_links: options[:ignore_relative],
|
117
|
+
ignore_fragment_links: options[:ignore_fragments],
|
118
|
+
external_links_only: options[:external_links_only] }
|
119
|
+
res = Curl::Html.new(url, curl_settings)
|
119
120
|
res.curl
|
120
121
|
|
121
122
|
if options[:info]
|
122
123
|
output.push(res.headers)
|
123
|
-
# print_out(res.headers, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
|
124
124
|
next
|
125
125
|
end
|
126
126
|
|
127
127
|
if options[:search]
|
128
|
-
|
128
|
+
source = res.search(options[:search], return_source: true)
|
129
129
|
|
130
|
-
out =
|
131
|
-
|
130
|
+
out = res.parse(source)
|
131
|
+
|
132
|
+
if options[:query]
|
133
|
+
out = out.to_data(url: url, clean: options[:clean]).dot_query(options[:query])
|
134
|
+
else
|
135
|
+
out = out.to_data
|
136
|
+
end
|
137
|
+
output.push([out])
|
132
138
|
elsif options[:query]
|
133
139
|
queried = res.to_data.dot_query(options[:query])
|
134
140
|
output.push(queried) if queried
|
@@ -136,7 +142,7 @@ command %i[html curl] do |c|
|
|
136
142
|
output.push(res.to_data(url: url))
|
137
143
|
end
|
138
144
|
end
|
139
|
-
|
145
|
+
output.delete_if(&:nil?)
|
140
146
|
output.delete_if(&:empty?)
|
141
147
|
output = output[0] if output.count == 1
|
142
148
|
output.map! { |o| o[options[:raw].to_sym] } if options[:raw]
|
@@ -149,13 +155,13 @@ desc 'Save a screenshot of a URL'
|
|
149
155
|
arg_name 'URL', multiple: true
|
150
156
|
command :screenshot do |c|
|
151
157
|
c.desc 'Type of screenshot to save (full (requires firefox), print, visible)'
|
152
|
-
c.flag %i[t type], type: ScreenshotType, must_match: /^[fpv].*?$/, default_value: '
|
158
|
+
c.flag %i[t type], type: ScreenshotType, must_match: /^[fpv].*?$/, default_value: 'visible'
|
153
159
|
|
154
160
|
c.desc 'Browser to use (firefox, chrome)'
|
155
161
|
c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/, default_value: 'chrome'
|
156
162
|
|
157
163
|
c.desc 'File destination'
|
158
|
-
c.flag %i[o out file]
|
164
|
+
c.flag %i[o out file], required: true
|
159
165
|
|
160
166
|
c.desc 'Define a header to send as key=value'
|
161
167
|
c.flag %i[h header], multiple: true
|
@@ -164,11 +170,19 @@ command :screenshot do |c|
|
|
164
170
|
urls = args.join(' ').split(/[, ]+/)
|
165
171
|
headers = break_headers(options[:header])
|
166
172
|
|
173
|
+
type = options[:type]
|
174
|
+
browser = options[:browser]
|
175
|
+
|
176
|
+
type = type.is_a?(Symbol) ? type : type.normalize_screenshot_type
|
177
|
+
browser = browser.is_a?(Symbol) ? browser : browser.normalize_browser_type
|
178
|
+
|
179
|
+
raise 'Full page screen shots only available with Firefox' if type == :full_page && browser != :firefox
|
180
|
+
|
167
181
|
urls.each do |url|
|
168
182
|
c = Curl::Html.new(url)
|
169
183
|
c.headers = headers
|
170
|
-
c.browser =
|
171
|
-
c.screenshot(options[:out], type:
|
184
|
+
c.browser = browser
|
185
|
+
c.screenshot(options[:out], type: type)
|
172
186
|
end
|
173
187
|
end
|
174
188
|
end
|
@@ -208,12 +222,26 @@ command :json do |c|
|
|
208
222
|
headers: res.headers
|
209
223
|
})
|
210
224
|
else
|
211
|
-
|
225
|
+
if options[:query]
|
226
|
+
if options[:query] =~ /^json$/
|
227
|
+
res = json
|
228
|
+
elsif options[:query] =~ /^json\./
|
229
|
+
query = options[:query].sub(/^json\./, '')
|
230
|
+
else
|
231
|
+
query = options[:query]
|
232
|
+
end
|
233
|
+
|
234
|
+
res = json.dot_query(query)
|
235
|
+
else
|
236
|
+
res = res.to_data
|
237
|
+
end
|
212
238
|
|
213
|
-
output.push(
|
239
|
+
output.push(res)
|
214
240
|
end
|
215
241
|
end
|
216
242
|
|
243
|
+
output = output[0] if output.count == 1
|
244
|
+
|
217
245
|
print_out(output, global_options[:yaml], pretty: global_options[:pretty])
|
218
246
|
end
|
219
247
|
end
|
@@ -221,12 +249,18 @@ end
|
|
221
249
|
desc 'Extract contents between two regular expressions'
|
222
250
|
arg_name 'URL', multiple: true
|
223
251
|
command :extract do |c|
|
224
|
-
c.desc 'Text before extraction
|
252
|
+
c.desc 'Text before extraction'
|
225
253
|
c.flag %i[b before]
|
226
254
|
|
227
|
-
c.desc 'Text after extraction
|
255
|
+
c.desc 'Text after extraction'
|
228
256
|
c.flag %i[a after]
|
229
257
|
|
258
|
+
c.desc 'Process before/after strings as regular expressions'
|
259
|
+
c.switch %i[r regex]
|
260
|
+
|
261
|
+
c.desc 'Include the before/after matches in the result'
|
262
|
+
c.switch %i[i include]
|
263
|
+
|
230
264
|
c.desc 'Define a header to send as key=value'
|
231
265
|
c.flag %i[h header], multiple: true
|
232
266
|
|
@@ -249,7 +283,15 @@ command :extract do |c|
|
|
249
283
|
res = Curl::Html.new(url, { headers: headers, headers_only: false,
|
250
284
|
compressed: options[:compressed], clean: options[:clean] })
|
251
285
|
res.curl
|
252
|
-
|
286
|
+
if options[:regex]
|
287
|
+
before = Regexp.new(options[:before])
|
288
|
+
after = Regexp.new(options[:after])
|
289
|
+
else
|
290
|
+
before = /#{Regexp.escape(options[:before])}/
|
291
|
+
after = /#{Regexp.escape(options[:after])}/
|
292
|
+
end
|
293
|
+
|
294
|
+
extracted = res.extract(before, after, inclusive: options[:include])
|
253
295
|
extracted.strip_tags! if options[:strip]
|
254
296
|
output.concat(extracted)
|
255
297
|
end
|
@@ -262,10 +304,10 @@ desc 'Extract all instances of a tag'
|
|
262
304
|
arg_name 'URL', multiple: true
|
263
305
|
command :tags do |c|
|
264
306
|
c.desc 'Define a header to send as key=value'
|
265
|
-
c.flag %i[h header], multiple: true
|
307
|
+
c.flag %i[h header], multiple: true, arg_name: 'KEY=VAL'
|
266
308
|
|
267
309
|
c.desc 'Specify a tag to collect'
|
268
|
-
c.flag %i[t tag], multiple: true
|
310
|
+
c.flag %i[t tag], multiple: true, arg_name: 'TAG'
|
269
311
|
|
270
312
|
c.desc 'Expect compressed results'
|
271
313
|
c.switch %i[c compressed]
|
@@ -273,8 +315,14 @@ command :tags do |c|
|
|
273
315
|
c.desc 'Remove extra whitespace from results'
|
274
316
|
c.switch %i[clean]
|
275
317
|
|
276
|
-
c.desc '
|
277
|
-
c.
|
318
|
+
c.desc 'Output the HTML source of the results'
|
319
|
+
c.switch %i[source html]
|
320
|
+
|
321
|
+
c.desc 'Dot syntax query to filter results'
|
322
|
+
c.flag %i[q query filter], arg_name: 'DOT_SYNTAX'
|
323
|
+
|
324
|
+
c.desc 'Regurn an array of matches to a CSS or XPath query'
|
325
|
+
c.flag %i[search], arg_name: 'CSS/XPATH'
|
278
326
|
|
279
327
|
c.action do |global_options, options, args|
|
280
328
|
urls = args.join(' ').split(/[, ]+/)
|
@@ -286,9 +334,17 @@ command :tags do |c|
|
|
286
334
|
res = Curl::Html.new(url, { headers: headers, headers_only: options[:headers],
|
287
335
|
compressed: options[:compressed], clean: options[:clean] })
|
288
336
|
res.curl
|
337
|
+
|
289
338
|
output = []
|
290
339
|
if options[:search]
|
291
|
-
|
340
|
+
out = res.search(options[:search])
|
341
|
+
|
342
|
+
out = out.dot_query(options[:query]) if options[:query]
|
343
|
+
output.push(out)
|
344
|
+
elsif options[:query]
|
345
|
+
query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
|
346
|
+
|
347
|
+
output = res.to_data.dot_query(query)
|
292
348
|
elsif tags.count.positive?
|
293
349
|
tags.each { |tag| output.concat(res.tags(tag)) }
|
294
350
|
else
|
@@ -296,7 +352,13 @@ command :tags do |c|
|
|
296
352
|
end
|
297
353
|
end
|
298
354
|
|
299
|
-
|
355
|
+
output = output[0] if output.count == 1
|
356
|
+
|
357
|
+
if options[:source]
|
358
|
+
puts output.to_html
|
359
|
+
else
|
360
|
+
print_out(output, global_options[:yaml], pretty: global_options[:pretty])
|
361
|
+
end
|
300
362
|
end
|
301
363
|
end
|
302
364
|
|
@@ -312,6 +374,9 @@ command :images do |c|
|
|
312
374
|
c.desc 'Remove extra whitespace from results'
|
313
375
|
c.switch %i[clean]
|
314
376
|
|
377
|
+
c.desc 'Filter output using dot-syntax path'
|
378
|
+
c.flag %i[q query filter]
|
379
|
+
|
315
380
|
c.desc 'Define a header to send as key=value'
|
316
381
|
c.flag %i[h header], multiple: true
|
317
382
|
|
@@ -326,7 +391,15 @@ command :images do |c|
|
|
326
391
|
urls.each do |url|
|
327
392
|
res = Curl::Html.new(url, { compressed: options[:compressed], clean: options[:clean] })
|
328
393
|
res.curl
|
329
|
-
|
394
|
+
|
395
|
+
res = res.images(types: types)
|
396
|
+
|
397
|
+
if options[:query]
|
398
|
+
query = options[:query] =~ /^images/ ? options[:query] : "images#{options[:query]}"
|
399
|
+
res = { images: res }.dot_query(query)
|
400
|
+
end
|
401
|
+
|
402
|
+
output.concat(res)
|
330
403
|
end
|
331
404
|
|
332
405
|
print_out(output, global_options[:yaml], pretty: global_options[:pretty])
|
@@ -367,7 +440,7 @@ command :links do |c|
|
|
367
440
|
|
368
441
|
if options[:query]
|
369
442
|
query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
|
370
|
-
queried =
|
443
|
+
queried = res.to_data.dot_query(query)
|
371
444
|
output.concat(queried) if queried
|
372
445
|
else
|
373
446
|
output.concat(res.body_links)
|
@@ -414,7 +487,7 @@ desc %(Scrape a page using a web browser, for dynamic (JS) pages. Be sure to hav
|
|
414
487
|
arg_name 'URL', multiple: true
|
415
488
|
command :scrape do |c|
|
416
489
|
c.desc 'Browser to use (firefox, chrome)'
|
417
|
-
c.flag %i[b browser], type: BrowserType
|
490
|
+
c.flag %i[b browser], type: BrowserType, required: true
|
418
491
|
|
419
492
|
c.desc 'Regurn an array of matches to a CSS or XPath query'
|
420
493
|
c.flag %i[search]
|
@@ -437,30 +510,19 @@ command :scrape do |c|
|
|
437
510
|
output = []
|
438
511
|
|
439
512
|
urls.each do |url|
|
440
|
-
|
441
|
-
|
442
|
-
driver.get url
|
443
|
-
res = driver.page_source
|
444
|
-
|
445
|
-
res = Curl::Html.new(nil, { source: res, clean: options[:clean] })
|
446
|
-
res.curl
|
447
|
-
if options[:search]
|
448
|
-
out = res.search(options[:search])
|
449
|
-
|
450
|
-
out = out.dot_query(options[:query]) if options[:query]
|
451
|
-
output.push(out)
|
452
|
-
elsif options[:query]
|
453
|
-
queried = res.to_data(url: url).dot_query(options[:query])
|
454
|
-
output = queried if queried
|
455
|
-
else
|
456
|
-
output.push(res.to_data(url: url))
|
457
|
-
end
|
513
|
+
res = Curl::Html.new(url, { browser: options[:browser], clean: options[:clean] })
|
514
|
+
res.curl
|
458
515
|
|
459
|
-
|
516
|
+
if options[:search]
|
517
|
+
out = res.search(options[:search])
|
460
518
|
|
461
|
-
|
462
|
-
|
463
|
-
|
519
|
+
out = out.dot_query(options[:query]) if options[:query]
|
520
|
+
output.push(out)
|
521
|
+
elsif options[:query]
|
522
|
+
queried = res.to_data(url: url).dot_query(options[:query])
|
523
|
+
output.push(queried) if queried
|
524
|
+
else
|
525
|
+
output.push(res.to_data(url: url))
|
464
526
|
end
|
465
527
|
end
|
466
528
|
|
data/curlyq.gemspec
CHANGED
@@ -16,10 +16,14 @@ spec = Gem::Specification.new do |s|
|
|
16
16
|
s.rdoc_options << '--title' << 'curlyq' << '--main' << 'README.rdoc' << '-ri'
|
17
17
|
s.bindir = 'bin'
|
18
18
|
s.executables << 'curlyq'
|
19
|
-
s.add_development_dependency('rake','~> 0.
|
20
|
-
s.add_development_dependency('rdoc', '~>
|
21
|
-
s.add_development_dependency('
|
19
|
+
s.add_development_dependency('rake','~> 13.0', '>= 13.0.1')
|
20
|
+
s.add_development_dependency('rdoc', '~> 6.3.1')
|
21
|
+
s.add_development_dependency('test-unit', '~> 3.4.4')
|
22
22
|
s.add_development_dependency('yard', '~> 0.9', '>= 0.9.26')
|
23
|
+
s.add_development_dependency('tty-spinner', '~> 0.9', '>= 0.9.3')
|
24
|
+
s.add_development_dependency('tty-progressbar', '~> 0.18', '>= 0.18.2')
|
25
|
+
s.add_development_dependency('pastel', '~> 0.8.0')
|
26
|
+
s.add_development_dependency('parallel_tests', '~> 3.7', '>= 3.7.3')
|
23
27
|
s.add_runtime_dependency('gli','~> 2.21.0')
|
24
28
|
s.add_runtime_dependency('tty-which','~> 0.5.0')
|
25
29
|
s.add_runtime_dependency('nokogiri','~> 1.16.0')
|
data/lib/curly/array.rb
CHANGED
@@ -66,69 +66,94 @@ class ::Array
|
|
66
66
|
replace dedup_links
|
67
67
|
end
|
68
68
|
|
69
|
+
#---------------------------------------------------------
|
70
|
+
## Run a query on array elements
|
69
71
|
##
|
70
|
-
##
|
72
|
+
## @param path [String] dot.syntax path to compare
|
71
73
|
##
|
72
|
-
## @
|
73
|
-
##
|
74
|
-
## @return [Array] Matching elements
|
74
|
+
## @return [Array] elements matching dot query
|
75
75
|
##
|
76
76
|
def dot_query(path)
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
77
|
+
filter! do |tag|
|
78
|
+
r = tag.dot_query(path)
|
79
|
+
if r.is_a?(Array)
|
80
|
+
r.count.positive?
|
81
|
+
else
|
82
|
+
r
|
83
|
+
end
|
84
84
|
end
|
85
85
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
out = out[eval(el)] if out.is_a?(Array) && el =~ /^[\d.,]+$/
|
86
|
+
return self
|
87
|
+
end
|
88
|
+
|
89
|
+
def to_html
|
90
|
+
map { |el| el.to_html }
|
91
|
+
end
|
92
|
+
|
93
|
+
##
|
94
|
+
## Test if a tag contains an attribute matching filter queries
|
95
|
+
##
|
96
|
+
## @param tag_name [String] The tag name
|
97
|
+
## @param classes [String] The classes to match
|
98
|
+
## @param id [String] The id attribute to
|
99
|
+
## match
|
100
|
+
## @param attribute [String] The attribute
|
101
|
+
## @param operator [String] The operator, <>= *=
|
102
|
+
## $= ^=
|
103
|
+
## @param value [String] The value to match
|
104
|
+
## @param descendant [Boolean] Check descendant tags
|
105
|
+
##
|
106
|
+
def tag_match(tag_name, classes, id, attribute, operator, value, descendant: false)
|
107
|
+
tag = self
|
108
|
+
keep = true
|
109
|
+
|
110
|
+
keep = false if tag_name && !tag['tag'] =~ /^#{tag_name}$/i
|
111
|
+
|
112
|
+
if tag.key?('attrs') && tag['attrs']
|
113
|
+
if keep && id
|
114
|
+
tag_id = tag['attrs'].filter { |a| a['key'] == 'id' }.first['value']
|
115
|
+
keep = tag_id && tag_id =~ /#{id}/i
|
116
|
+
end
|
117
|
+
|
118
|
+
if keep && classes
|
119
|
+
cls = tag['attrs'].filter { |a| a['key'] == 'class' }.first
|
120
|
+
if cls
|
121
|
+
all = true
|
122
|
+
classes.each { |c| all = cls['value'].include?(c) }
|
123
|
+
keep = all
|
124
|
+
else
|
125
|
+
keep = false
|
128
126
|
end
|
129
|
-
output.push(out)
|
130
127
|
end
|
128
|
+
|
129
|
+
if keep && attribute
|
130
|
+
attributes = tag['attrs'].filter { |a| a['key'] =~ /^#{attribute}$/i }
|
131
|
+
any = false
|
132
|
+
attributes.each do |a|
|
133
|
+
break if any
|
134
|
+
|
135
|
+
any = case operator
|
136
|
+
when /^*/
|
137
|
+
a['value'] =~ /#{value}/i
|
138
|
+
when /^\^/
|
139
|
+
a['value'] =~ /^#{value}/i
|
140
|
+
when /^\$/
|
141
|
+
a['value'] =~ /#{value}$/i
|
142
|
+
else
|
143
|
+
a['value'] =~ /^#{value}$/i
|
144
|
+
end
|
145
|
+
end
|
146
|
+
keep = any
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
return false if descendant && !keep
|
151
|
+
|
152
|
+
if !descendant && tag.key?('tags')
|
153
|
+
tags = tag['tags'].filter { |t| t.tag_match(tag_name, classes, id, attribute, operator, value) }
|
154
|
+
tags.count.positive?
|
155
|
+
else
|
156
|
+
keep
|
131
157
|
end
|
132
|
-
output
|
133
158
|
end
|
134
159
|
end
|