curlyq 0.0.4 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.irbrc +4 -0
- data/CHANGELOG.md +36 -0
- data/Gemfile.lock +30 -7
- data/README.md +260 -17
- data/Rakefile +32 -4
- data/bin/curlyq +114 -52
- data/curlyq.gemspec +7 -3
- data/lib/curly/array.rb +80 -55
- data/lib/curly/curl/html.rb +71 -54
- data/lib/curly/hash.rb +59 -7
- data/lib/curly/version.rb +1 -1
- data/src/_README.md +239 -1
- data/test/curlyq_extract_test.rb +43 -0
- data/test/curlyq_headlinks_test.rb +32 -0
- data/test/curlyq_html_test.rb +25 -0
- data/test/curlyq_images_test.rb +27 -0
- data/test/curlyq_json_test.rb +33 -0
- data/test/curlyq_links_test.rb +20 -0
- data/test/curlyq_scrape_test.rb +22 -0
- data/test/curlyq_tags_test.rb +31 -0
- data/test/helpers/curlyq-helpers.rb +29 -0
- data/test/helpers/fake_std_out.rb +30 -0
- data/test/helpers/threaded_tests.rb +182 -0
- data/test/test_helper.rb +7 -2
- metadata +101 -9
data/bin/curlyq
CHANGED
@@ -71,7 +71,7 @@ command %i[html curl] do |c|
|
|
71
71
|
c.switch %i[I info], negatable: false
|
72
72
|
|
73
73
|
c.desc 'Regurn an array of matches to a CSS or XPath query'
|
74
|
-
c.flag %i[search]
|
74
|
+
c.flag %i[s search]
|
75
75
|
|
76
76
|
c.desc 'Define a header to send as "key=value"'
|
77
77
|
c.flag %i[h header], multiple: true
|
@@ -110,25 +110,31 @@ command %i[html curl] do |c|
|
|
110
110
|
output = []
|
111
111
|
|
112
112
|
urls.each do |url|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
113
|
+
curl_settings = { browser: options[:browser], fallback: options[:fallback],
|
114
|
+
headers: headers, headers_only: options[:info],
|
115
|
+
compressed: options[:compressed], clean: options[:clean],
|
116
|
+
ignore_local_links: options[:ignore_relative],
|
117
|
+
ignore_fragment_links: options[:ignore_fragments],
|
118
|
+
external_links_only: options[:external_links_only] }
|
119
|
+
res = Curl::Html.new(url, curl_settings)
|
119
120
|
res.curl
|
120
121
|
|
121
122
|
if options[:info]
|
122
123
|
output.push(res.headers)
|
123
|
-
# print_out(res.headers, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
|
124
124
|
next
|
125
125
|
end
|
126
126
|
|
127
127
|
if options[:search]
|
128
|
-
|
128
|
+
source = res.search(options[:search], return_source: true)
|
129
129
|
|
130
|
-
out =
|
131
|
-
|
130
|
+
out = res.parse(source)
|
131
|
+
|
132
|
+
if options[:query]
|
133
|
+
out = out.to_data(url: url, clean: options[:clean]).dot_query(options[:query])
|
134
|
+
else
|
135
|
+
out = out.to_data
|
136
|
+
end
|
137
|
+
output.push([out])
|
132
138
|
elsif options[:query]
|
133
139
|
queried = res.to_data.dot_query(options[:query])
|
134
140
|
output.push(queried) if queried
|
@@ -136,7 +142,7 @@ command %i[html curl] do |c|
|
|
136
142
|
output.push(res.to_data(url: url))
|
137
143
|
end
|
138
144
|
end
|
139
|
-
|
145
|
+
output.delete_if(&:nil?)
|
140
146
|
output.delete_if(&:empty?)
|
141
147
|
output = output[0] if output.count == 1
|
142
148
|
output.map! { |o| o[options[:raw].to_sym] } if options[:raw]
|
@@ -149,13 +155,13 @@ desc 'Save a screenshot of a URL'
|
|
149
155
|
arg_name 'URL', multiple: true
|
150
156
|
command :screenshot do |c|
|
151
157
|
c.desc 'Type of screenshot to save (full (requires firefox), print, visible)'
|
152
|
-
c.flag %i[t type], type: ScreenshotType, must_match: /^[fpv].*?$/, default_value: '
|
158
|
+
c.flag %i[t type], type: ScreenshotType, must_match: /^[fpv].*?$/, default_value: 'visible'
|
153
159
|
|
154
160
|
c.desc 'Browser to use (firefox, chrome)'
|
155
161
|
c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/, default_value: 'chrome'
|
156
162
|
|
157
163
|
c.desc 'File destination'
|
158
|
-
c.flag %i[o out file]
|
164
|
+
c.flag %i[o out file], required: true
|
159
165
|
|
160
166
|
c.desc 'Define a header to send as key=value'
|
161
167
|
c.flag %i[h header], multiple: true
|
@@ -164,11 +170,19 @@ command :screenshot do |c|
|
|
164
170
|
urls = args.join(' ').split(/[, ]+/)
|
165
171
|
headers = break_headers(options[:header])
|
166
172
|
|
173
|
+
type = options[:type]
|
174
|
+
browser = options[:browser]
|
175
|
+
|
176
|
+
type = type.is_a?(Symbol) ? type : type.normalize_screenshot_type
|
177
|
+
browser = browser.is_a?(Symbol) ? browser : browser.normalize_browser_type
|
178
|
+
|
179
|
+
raise 'Full page screen shots only available with Firefox' if type == :full_page && browser != :firefox
|
180
|
+
|
167
181
|
urls.each do |url|
|
168
182
|
c = Curl::Html.new(url)
|
169
183
|
c.headers = headers
|
170
|
-
c.browser =
|
171
|
-
c.screenshot(options[:out], type:
|
184
|
+
c.browser = browser
|
185
|
+
c.screenshot(options[:out], type: type)
|
172
186
|
end
|
173
187
|
end
|
174
188
|
end
|
@@ -208,12 +222,26 @@ command :json do |c|
|
|
208
222
|
headers: res.headers
|
209
223
|
})
|
210
224
|
else
|
211
|
-
|
225
|
+
if options[:query]
|
226
|
+
if options[:query] =~ /^json$/
|
227
|
+
res = json
|
228
|
+
elsif options[:query] =~ /^json\./
|
229
|
+
query = options[:query].sub(/^json\./, '')
|
230
|
+
else
|
231
|
+
query = options[:query]
|
232
|
+
end
|
233
|
+
|
234
|
+
res = json.dot_query(query)
|
235
|
+
else
|
236
|
+
res = res.to_data
|
237
|
+
end
|
212
238
|
|
213
|
-
output.push(
|
239
|
+
output.push(res)
|
214
240
|
end
|
215
241
|
end
|
216
242
|
|
243
|
+
output = output[0] if output.count == 1
|
244
|
+
|
217
245
|
print_out(output, global_options[:yaml], pretty: global_options[:pretty])
|
218
246
|
end
|
219
247
|
end
|
@@ -221,12 +249,18 @@ end
|
|
221
249
|
desc 'Extract contents between two regular expressions'
|
222
250
|
arg_name 'URL', multiple: true
|
223
251
|
command :extract do |c|
|
224
|
-
c.desc 'Text before extraction
|
252
|
+
c.desc 'Text before extraction'
|
225
253
|
c.flag %i[b before]
|
226
254
|
|
227
|
-
c.desc 'Text after extraction
|
255
|
+
c.desc 'Text after extraction'
|
228
256
|
c.flag %i[a after]
|
229
257
|
|
258
|
+
c.desc 'Process before/after strings as regular expressions'
|
259
|
+
c.switch %i[r regex]
|
260
|
+
|
261
|
+
c.desc 'Include the before/after matches in the result'
|
262
|
+
c.switch %i[i include]
|
263
|
+
|
230
264
|
c.desc 'Define a header to send as key=value'
|
231
265
|
c.flag %i[h header], multiple: true
|
232
266
|
|
@@ -249,7 +283,15 @@ command :extract do |c|
|
|
249
283
|
res = Curl::Html.new(url, { headers: headers, headers_only: false,
|
250
284
|
compressed: options[:compressed], clean: options[:clean] })
|
251
285
|
res.curl
|
252
|
-
|
286
|
+
if options[:regex]
|
287
|
+
before = Regexp.new(options[:before])
|
288
|
+
after = Regexp.new(options[:after])
|
289
|
+
else
|
290
|
+
before = /#{Regexp.escape(options[:before])}/
|
291
|
+
after = /#{Regexp.escape(options[:after])}/
|
292
|
+
end
|
293
|
+
|
294
|
+
extracted = res.extract(before, after, inclusive: options[:include])
|
253
295
|
extracted.strip_tags! if options[:strip]
|
254
296
|
output.concat(extracted)
|
255
297
|
end
|
@@ -262,10 +304,10 @@ desc 'Extract all instances of a tag'
|
|
262
304
|
arg_name 'URL', multiple: true
|
263
305
|
command :tags do |c|
|
264
306
|
c.desc 'Define a header to send as key=value'
|
265
|
-
c.flag %i[h header], multiple: true
|
307
|
+
c.flag %i[h header], multiple: true, arg_name: 'KEY=VAL'
|
266
308
|
|
267
309
|
c.desc 'Specify a tag to collect'
|
268
|
-
c.flag %i[t tag], multiple: true
|
310
|
+
c.flag %i[t tag], multiple: true, arg_name: 'TAG'
|
269
311
|
|
270
312
|
c.desc 'Expect compressed results'
|
271
313
|
c.switch %i[c compressed]
|
@@ -273,8 +315,14 @@ command :tags do |c|
|
|
273
315
|
c.desc 'Remove extra whitespace from results'
|
274
316
|
c.switch %i[clean]
|
275
317
|
|
276
|
-
c.desc '
|
277
|
-
c.
|
318
|
+
c.desc 'Output the HTML source of the results'
|
319
|
+
c.switch %i[source html]
|
320
|
+
|
321
|
+
c.desc 'Dot syntax query to filter results'
|
322
|
+
c.flag %i[q query filter], arg_name: 'DOT_SYNTAX'
|
323
|
+
|
324
|
+
c.desc 'Regurn an array of matches to a CSS or XPath query'
|
325
|
+
c.flag %i[search], arg_name: 'CSS/XPATH'
|
278
326
|
|
279
327
|
c.action do |global_options, options, args|
|
280
328
|
urls = args.join(' ').split(/[, ]+/)
|
@@ -286,9 +334,17 @@ command :tags do |c|
|
|
286
334
|
res = Curl::Html.new(url, { headers: headers, headers_only: options[:headers],
|
287
335
|
compressed: options[:compressed], clean: options[:clean] })
|
288
336
|
res.curl
|
337
|
+
|
289
338
|
output = []
|
290
339
|
if options[:search]
|
291
|
-
|
340
|
+
out = res.search(options[:search])
|
341
|
+
|
342
|
+
out = out.dot_query(options[:query]) if options[:query]
|
343
|
+
output.push(out)
|
344
|
+
elsif options[:query]
|
345
|
+
query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
|
346
|
+
|
347
|
+
output = res.to_data.dot_query(query)
|
292
348
|
elsif tags.count.positive?
|
293
349
|
tags.each { |tag| output.concat(res.tags(tag)) }
|
294
350
|
else
|
@@ -296,7 +352,13 @@ command :tags do |c|
|
|
296
352
|
end
|
297
353
|
end
|
298
354
|
|
299
|
-
|
355
|
+
output = output[0] if output.count == 1
|
356
|
+
|
357
|
+
if options[:source]
|
358
|
+
puts output.to_html
|
359
|
+
else
|
360
|
+
print_out(output, global_options[:yaml], pretty: global_options[:pretty])
|
361
|
+
end
|
300
362
|
end
|
301
363
|
end
|
302
364
|
|
@@ -312,6 +374,9 @@ command :images do |c|
|
|
312
374
|
c.desc 'Remove extra whitespace from results'
|
313
375
|
c.switch %i[clean]
|
314
376
|
|
377
|
+
c.desc 'Filter output using dot-syntax path'
|
378
|
+
c.flag %i[q query filter]
|
379
|
+
|
315
380
|
c.desc 'Define a header to send as key=value'
|
316
381
|
c.flag %i[h header], multiple: true
|
317
382
|
|
@@ -326,7 +391,15 @@ command :images do |c|
|
|
326
391
|
urls.each do |url|
|
327
392
|
res = Curl::Html.new(url, { compressed: options[:compressed], clean: options[:clean] })
|
328
393
|
res.curl
|
329
|
-
|
394
|
+
|
395
|
+
res = res.images(types: types)
|
396
|
+
|
397
|
+
if options[:query]
|
398
|
+
query = options[:query] =~ /^images/ ? options[:query] : "images#{options[:query]}"
|
399
|
+
res = { images: res }.dot_query(query)
|
400
|
+
end
|
401
|
+
|
402
|
+
output.concat(res)
|
330
403
|
end
|
331
404
|
|
332
405
|
print_out(output, global_options[:yaml], pretty: global_options[:pretty])
|
@@ -367,7 +440,7 @@ command :links do |c|
|
|
367
440
|
|
368
441
|
if options[:query]
|
369
442
|
query = options[:query] =~ /^links/ ? options[:query] : "links#{options[:query]}"
|
370
|
-
queried =
|
443
|
+
queried = res.to_data.dot_query(query)
|
371
444
|
output.concat(queried) if queried
|
372
445
|
else
|
373
446
|
output.concat(res.body_links)
|
@@ -414,7 +487,7 @@ desc %(Scrape a page using a web browser, for dynamic (JS) pages. Be sure to hav
|
|
414
487
|
arg_name 'URL', multiple: true
|
415
488
|
command :scrape do |c|
|
416
489
|
c.desc 'Browser to use (firefox, chrome)'
|
417
|
-
c.flag %i[b browser], type: BrowserType
|
490
|
+
c.flag %i[b browser], type: BrowserType, required: true
|
418
491
|
|
419
492
|
c.desc 'Regurn an array of matches to a CSS or XPath query'
|
420
493
|
c.flag %i[search]
|
@@ -437,30 +510,19 @@ command :scrape do |c|
|
|
437
510
|
output = []
|
438
511
|
|
439
512
|
urls.each do |url|
|
440
|
-
|
441
|
-
|
442
|
-
driver.get url
|
443
|
-
res = driver.page_source
|
444
|
-
|
445
|
-
res = Curl::Html.new(nil, { source: res, clean: options[:clean] })
|
446
|
-
res.curl
|
447
|
-
if options[:search]
|
448
|
-
out = res.search(options[:search])
|
449
|
-
|
450
|
-
out = out.dot_query(options[:query]) if options[:query]
|
451
|
-
output.push(out)
|
452
|
-
elsif options[:query]
|
453
|
-
queried = res.to_data(url: url).dot_query(options[:query])
|
454
|
-
output = queried if queried
|
455
|
-
else
|
456
|
-
output.push(res.to_data(url: url))
|
457
|
-
end
|
513
|
+
res = Curl::Html.new(url, { browser: options[:browser], clean: options[:clean] })
|
514
|
+
res.curl
|
458
515
|
|
459
|
-
|
516
|
+
if options[:search]
|
517
|
+
out = res.search(options[:search])
|
460
518
|
|
461
|
-
|
462
|
-
|
463
|
-
|
519
|
+
out = out.dot_query(options[:query]) if options[:query]
|
520
|
+
output.push(out)
|
521
|
+
elsif options[:query]
|
522
|
+
queried = res.to_data(url: url).dot_query(options[:query])
|
523
|
+
output.push(queried) if queried
|
524
|
+
else
|
525
|
+
output.push(res.to_data(url: url))
|
464
526
|
end
|
465
527
|
end
|
466
528
|
|
data/curlyq.gemspec
CHANGED
@@ -16,10 +16,14 @@ spec = Gem::Specification.new do |s|
|
|
16
16
|
s.rdoc_options << '--title' << 'curlyq' << '--main' << 'README.rdoc' << '-ri'
|
17
17
|
s.bindir = 'bin'
|
18
18
|
s.executables << 'curlyq'
|
19
|
-
s.add_development_dependency('rake','~> 0.
|
20
|
-
s.add_development_dependency('rdoc', '~>
|
21
|
-
s.add_development_dependency('
|
19
|
+
s.add_development_dependency('rake','~> 13.0', '>= 13.0.1')
|
20
|
+
s.add_development_dependency('rdoc', '~> 6.3.1')
|
21
|
+
s.add_development_dependency('test-unit', '~> 3.4.4')
|
22
22
|
s.add_development_dependency('yard', '~> 0.9', '>= 0.9.26')
|
23
|
+
s.add_development_dependency('tty-spinner', '~> 0.9', '>= 0.9.3')
|
24
|
+
s.add_development_dependency('tty-progressbar', '~> 0.18', '>= 0.18.2')
|
25
|
+
s.add_development_dependency('pastel', '~> 0.8.0')
|
26
|
+
s.add_development_dependency('parallel_tests', '~> 3.7', '>= 3.7.3')
|
23
27
|
s.add_runtime_dependency('gli','~> 2.21.0')
|
24
28
|
s.add_runtime_dependency('tty-which','~> 0.5.0')
|
25
29
|
s.add_runtime_dependency('nokogiri','~> 1.16.0')
|
data/lib/curly/array.rb
CHANGED
@@ -66,69 +66,94 @@ class ::Array
|
|
66
66
|
replace dedup_links
|
67
67
|
end
|
68
68
|
|
69
|
+
#---------------------------------------------------------
|
70
|
+
## Run a query on array elements
|
69
71
|
##
|
70
|
-
##
|
72
|
+
## @param path [String] dot.syntax path to compare
|
71
73
|
##
|
72
|
-
## @
|
73
|
-
##
|
74
|
-
## @return [Array] Matching elements
|
74
|
+
## @return [Array] elements matching dot query
|
75
75
|
##
|
76
76
|
def dot_query(path)
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
77
|
+
filter! do |tag|
|
78
|
+
r = tag.dot_query(path)
|
79
|
+
if r.is_a?(Array)
|
80
|
+
r.count.positive?
|
81
|
+
else
|
82
|
+
r
|
83
|
+
end
|
84
84
|
end
|
85
85
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
out = out[eval(el)] if out.is_a?(Array) && el =~ /^[\d.,]+$/
|
86
|
+
return self
|
87
|
+
end
|
88
|
+
|
89
|
+
def to_html
|
90
|
+
map { |el| el.to_html }
|
91
|
+
end
|
92
|
+
|
93
|
+
##
|
94
|
+
## Test if a tag contains an attribute matching filter queries
|
95
|
+
##
|
96
|
+
## @param tag_name [String] The tag name
|
97
|
+
## @param classes [String] The classes to match
|
98
|
+
## @param id [String] The id attribute to
|
99
|
+
## match
|
100
|
+
## @param attribute [String] The attribute
|
101
|
+
## @param operator [String] The operator, <>= *=
|
102
|
+
## $= ^=
|
103
|
+
## @param value [String] The value to match
|
104
|
+
## @param descendant [Boolean] Check descendant tags
|
105
|
+
##
|
106
|
+
def tag_match(tag_name, classes, id, attribute, operator, value, descendant: false)
|
107
|
+
tag = self
|
108
|
+
keep = true
|
109
|
+
|
110
|
+
keep = false if tag_name && !tag['tag'] =~ /^#{tag_name}$/i
|
111
|
+
|
112
|
+
if tag.key?('attrs') && tag['attrs']
|
113
|
+
if keep && id
|
114
|
+
tag_id = tag['attrs'].filter { |a| a['key'] == 'id' }.first['value']
|
115
|
+
keep = tag_id && tag_id =~ /#{id}/i
|
116
|
+
end
|
117
|
+
|
118
|
+
if keep && classes
|
119
|
+
cls = tag['attrs'].filter { |a| a['key'] == 'class' }.first
|
120
|
+
if cls
|
121
|
+
all = true
|
122
|
+
classes.each { |c| all = cls['value'].include?(c) }
|
123
|
+
keep = all
|
124
|
+
else
|
125
|
+
keep = false
|
128
126
|
end
|
129
|
-
output.push(out)
|
130
127
|
end
|
128
|
+
|
129
|
+
if keep && attribute
|
130
|
+
attributes = tag['attrs'].filter { |a| a['key'] =~ /^#{attribute}$/i }
|
131
|
+
any = false
|
132
|
+
attributes.each do |a|
|
133
|
+
break if any
|
134
|
+
|
135
|
+
any = case operator
|
136
|
+
when /^*/
|
137
|
+
a['value'] =~ /#{value}/i
|
138
|
+
when /^\^/
|
139
|
+
a['value'] =~ /^#{value}/i
|
140
|
+
when /^\$/
|
141
|
+
a['value'] =~ /#{value}$/i
|
142
|
+
else
|
143
|
+
a['value'] =~ /^#{value}$/i
|
144
|
+
end
|
145
|
+
end
|
146
|
+
keep = any
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
return false if descendant && !keep
|
151
|
+
|
152
|
+
if !descendant && tag.key?('tags')
|
153
|
+
tags = tag['tags'].filter { |t| t.tag_match(tag_name, classes, id, attribute, operator, value) }
|
154
|
+
tags.count.positive?
|
155
|
+
else
|
156
|
+
keep
|
131
157
|
end
|
132
|
-
output
|
133
158
|
end
|
134
159
|
end
|