curlyq 0.0.9 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/Gemfile.lock +1 -1
- data/README.md +8 -4
- data/Rakefile +17 -0
- data/bin/curlyq +16 -15
- data/lib/curly/array.rb +39 -2
- data/lib/curly/curl/html.rb +46 -7
- data/lib/curly/hash.rb +56 -6
- data/lib/curly/numeric.rb +11 -0
- data/lib/curly/string.rb +27 -3
- data/lib/curly/version.rb +3 -1
- data/lib/curly.rb +1 -0
- data/src/_README.md +5 -3
- data/test/curlyq_headlinks_test.rb +3 -2
- data/test/curlyq_html_test.rb +3 -3
- data/test/curlyq_scrape_test.rb +32 -2
- data/test/curlyq_tags_test.rb +12 -4
- data/test/helpers/curlyq-helpers.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a9b0847eb3dd79e15b96bed47858ad0eb0df2ba7db8cf2e3395cb9e08e71c194
|
4
|
+
data.tar.gz: '06623683ff93c02087432750a150ac663c4558b7d18323bbbb367e004abd58ab'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b7098dde55f9b76a53eff1f71a5d821a2db6d5828fb67428f2aa3ef5d6ab8e2bdbb79f5375fb5291b965ff3d0b9677cf0084782c078c2bb5575a8383bd26906
|
7
|
+
data.tar.gz: c0b02267ea0de1c490b2c2dcd171f8a992fa659733aa9bd9e0dc590988af3d7c5f4b6e38e0371ce72c879a1f956ec7f8b87e8432e684d8f7dad4f019314fa834
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,21 @@
|
|
1
|
+
### 0.0.11
|
2
|
+
|
3
|
+
2024-01-21 15:29
|
4
|
+
|
5
|
+
#### IMPROVED
|
6
|
+
|
7
|
+
- Add option for --local_links_only to html and links command, only returning links with the same origin site
|
8
|
+
|
9
|
+
### 0.0.10
|
10
|
+
|
11
|
+
2024-01-17 13:50
|
12
|
+
|
13
|
+
#### IMPROVED
|
14
|
+
|
15
|
+
- Update YARD documentation
|
16
|
+
- Breaking change, ensure all return types are Arrays, even with single objects, to aid in scriptability
|
17
|
+
- Screenshot test suite
|
18
|
+
|
1
19
|
### 0.0.9
|
2
20
|
|
3
21
|
2024-01-16 12:38
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -13,7 +13,7 @@ _If you find this useful, feel free to [buy me some coffee][donate]._
|
|
13
13
|
[jq]: https://github.com/jqlang/jq "Command-line JSON processor"
|
14
14
|
[yq]: https://github.com/mikefarah/yq "yq is a portable command-line YAML, JSON, XML, CSV, TOML and properties processor"
|
15
15
|
|
16
|
-
The current version of `curlyq` is 0.0.9
|
16
|
+
The current version of `curlyq` is 0.0.11
|
17
17
|
.
|
18
18
|
|
19
19
|
CurlyQ is a utility that provides a simple interface for curl, with additional features for things like extracting images and links, finding elements by CSS selector or XPath, getting detailed header info, and more. It's designed to be part of a scripting pipeline, outputting everything as structured data (JSON or YAML). It also has rudimentary support for making calls to JSON endpoints easier, but it's expected that you'll use something like [jq] to parse the output.
|
@@ -47,7 +47,7 @@ SYNOPSIS
|
|
47
47
|
curlyq [global options] command [command options] [arguments...]
|
48
48
|
|
49
49
|
VERSION
|
50
|
-
0.0.9
|
50
|
+
0.0.11
|
51
51
|
|
52
52
|
GLOBAL OPTIONS
|
53
53
|
--help - Show this message
|
@@ -94,11 +94,13 @@ Comparisons can be numeric or string comparisons. A numeric comparison like `cur
|
|
94
94
|
|
95
95
|
You can also use dot syntax inside of comparisons, e.g. `[links.rel*=me]` to target the links object (`html` command), and return only the links with a `rel=me` attribute. If the comparison is to an array object (like `class` or `rel`), it will match if any of the elements of the array match your comparison.
|
96
96
|
|
97
|
-
If you end the query with a specific key, only that key will be output. If there's only one match, it will be output as a raw string
|
97
|
+
If you end the query with a specific key, only that key will be output, but it will be in an array. If there's only one match, it will still be output as an array, containing the raw string as its single element.
|
98
98
|
|
99
99
|
curlyq tags --search '#main .post h3' -q '[attrs.id*=what].source' 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/'
|
100
100
|
|
101
|
-
|
101
|
+
[
|
102
|
+
"<h3 id=\"whats-next\">What’s Next</h3>"
|
103
|
+
]
|
102
104
|
|
103
105
|
#### Commands
|
104
106
|
|
@@ -237,6 +239,7 @@ COMMAND OPTIONS
|
|
237
239
|
-h, --header=arg - Define a header to send as "key=value" (may be used more than once, default: none)
|
238
240
|
--[no-]ignore_fragments - Ignore fragment hrefs when gathering content links
|
239
241
|
--[no-]ignore_relative - Ignore relative hrefs when gathering content links
|
242
|
+
-l, --local_links_only - Only gather internal (same-site) links
|
240
243
|
-q, --query, --filter=arg - Filter output using dot-syntax path (default: none)
|
241
244
|
-r, --raw=arg - Output a raw value for a key (default: none)
|
242
245
|
-s, --search=arg - Return an array of matches to a CSS or XPath query (default: none)
|
@@ -379,6 +382,7 @@ COMMAND OPTIONS
|
|
379
382
|
-d, --[no-]dedup - Filter out duplicate links, preserving only first one
|
380
383
|
--[no-]ignore_fragments - Ignore fragment hrefs when gathering content links
|
381
384
|
--[no-]ignore_relative - Ignore relative hrefs when gathering content links
|
385
|
+
-l, --local_links_only - Only gather internal (same-site) links
|
382
386
|
-q, --query, --filter=arg - Filter output using dot-syntax path (default: none)
|
383
387
|
-x, --external_links_only - Only gather external links
|
384
388
|
```
|
data/Rakefile
CHANGED
@@ -56,6 +56,23 @@ task :test, :pattern, :threads, :max_tests do |_, args|
|
|
56
56
|
ThreadedTests.new.run(pattern: pattern, max_threads: args[:threads].to_i, max_tests: args[:max_tests])
|
57
57
|
end
|
58
58
|
|
59
|
+
desc 'Install current gem in all versions of asdf-controlled ruby'
|
60
|
+
task :install do
|
61
|
+
Rake::Task['clobber'].invoke
|
62
|
+
Rake::Task['package'].invoke
|
63
|
+
Dir.chdir 'pkg'
|
64
|
+
file = Dir.glob('*.gem').last
|
65
|
+
|
66
|
+
current_ruby = `asdf current ruby`.match(/(\d.\d+.\d+)/)[1]
|
67
|
+
|
68
|
+
`asdf list ruby`.split.map { |ruby| ruby.strip.sub(/^*/, '') }.each do |ruby|
|
69
|
+
`asdf shell ruby #{ruby}`
|
70
|
+
puts `gem install #{file}`
|
71
|
+
end
|
72
|
+
|
73
|
+
`asdf shell ruby #{current_ruby}`
|
74
|
+
end
|
75
|
+
|
59
76
|
desc 'Development version check'
|
60
77
|
task :ver do
|
61
78
|
gver = `git ver`
|
data/bin/curlyq
CHANGED
@@ -49,7 +49,7 @@ end
|
|
49
49
|
def self.print_out(output, yaml, raw: false, pretty: true)
|
50
50
|
output = output.to_data if output.respond_to?(:to_data)
|
51
51
|
# Was intended to flatten single responses, but not getting an array back is unpredictable
|
52
|
-
|
52
|
+
output = output.clean_output
|
53
53
|
if output.is_a?(String)
|
54
54
|
print output
|
55
55
|
elsif raw
|
@@ -103,6 +103,9 @@ command %i[html curl] do |c|
|
|
103
103
|
c.desc 'Only gather external links'
|
104
104
|
c.switch %i[x external_links_only], default_value: false, negatable: false
|
105
105
|
|
106
|
+
c.desc 'Only gather internal (same-site) links'
|
107
|
+
c.switch %i[l local_links_only], default_value: false, negatable: false
|
108
|
+
|
106
109
|
c.action do |global_options, options, args|
|
107
110
|
urls = args.join(' ').split(/[, ]+/)
|
108
111
|
headers = break_headers(options[:header])
|
@@ -115,7 +118,8 @@ command %i[html curl] do |c|
|
|
115
118
|
compressed: options[:compressed], clean: options[:clean],
|
116
119
|
ignore_local_links: options[:ignore_relative],
|
117
120
|
ignore_fragment_links: options[:ignore_fragments],
|
118
|
-
external_links_only: options[:external_links_only]
|
121
|
+
external_links_only: options[:external_links_only],
|
122
|
+
local_links_only: options[:local_links_only] }
|
119
123
|
res = Curl::Html.new(url, curl_settings)
|
120
124
|
res.curl
|
121
125
|
|
@@ -144,14 +148,9 @@ command %i[html curl] do |c|
|
|
144
148
|
end
|
145
149
|
output.delete_if(&:nil?)
|
146
150
|
output.delete_if(&:empty?)
|
147
|
-
# output = output[0] if output.count == 1
|
148
151
|
output.map! { |o| o[options[:raw].to_sym] } if options[:raw]
|
149
152
|
|
150
|
-
|
151
|
-
while output.length == 1
|
152
|
-
output = output[0]
|
153
|
-
end
|
154
|
-
end
|
153
|
+
output = output.clean_output
|
155
154
|
|
156
155
|
print_out(output, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
|
157
156
|
end
|
@@ -246,7 +245,7 @@ command :json do |c|
|
|
246
245
|
end
|
247
246
|
end
|
248
247
|
|
249
|
-
|
248
|
+
output = output.clean_output
|
250
249
|
|
251
250
|
print_out(output, global_options[:yaml], pretty: global_options[:pretty])
|
252
251
|
end
|
@@ -356,9 +355,7 @@ command :tags do |c|
|
|
356
355
|
end
|
357
356
|
end
|
358
357
|
|
359
|
-
|
360
|
-
output = output[0]
|
361
|
-
end
|
358
|
+
output = output.clean_output
|
362
359
|
|
363
360
|
if options[:source]
|
364
361
|
puts output.to_html
|
@@ -424,6 +421,9 @@ command :links do |c|
|
|
424
421
|
c.desc 'Only gather external links'
|
425
422
|
c.switch %i[x external_links_only], default_value: false, negatable: false
|
426
423
|
|
424
|
+
c.desc 'Only gather internal (same-site) links'
|
425
|
+
c.switch %i[l local_links_only], default_value: false, negatable: false
|
426
|
+
|
427
427
|
c.desc 'Filter output using dot-syntax path'
|
428
428
|
c.flag %i[q query filter]
|
429
429
|
|
@@ -440,7 +440,8 @@ command :links do |c|
|
|
440
440
|
compressed: options[:compressed], clean: options[:clean],
|
441
441
|
ignore_local_links: options[:ignore_relative],
|
442
442
|
ignore_fragment_links: options[:ignore_fragments],
|
443
|
-
external_links_only: options[:external_links_only]
|
443
|
+
external_links_only: options[:external_links_only],
|
444
|
+
local_links_only: options[:local_links_only]
|
444
445
|
})
|
445
446
|
res.curl
|
446
447
|
|
@@ -482,7 +483,7 @@ command :headlinks do |c|
|
|
482
483
|
end
|
483
484
|
end
|
484
485
|
|
485
|
-
output = output
|
486
|
+
output = output.clean_output
|
486
487
|
|
487
488
|
print_out(output, global_options[:yaml], pretty: global_options[:pretty])
|
488
489
|
end
|
@@ -533,7 +534,7 @@ command :scrape do |c|
|
|
533
534
|
|
534
535
|
output.delete_if(&:empty?)
|
535
536
|
|
536
|
-
output = output
|
537
|
+
output = output.clean_output
|
537
538
|
|
538
539
|
if options[:raw]
|
539
540
|
output.map! { |o| o[options[:raw].to_sym] }
|
data/lib/curly/array.rb
CHANGED
@@ -66,7 +66,7 @@ class ::Array
|
|
66
66
|
replace dedup_links
|
67
67
|
end
|
68
68
|
|
69
|
-
|
69
|
+
##
|
70
70
|
## Run a query on array elements
|
71
71
|
##
|
72
72
|
## @param path [String] dot.syntax path to compare
|
@@ -80,16 +80,29 @@ class ::Array
|
|
80
80
|
res
|
81
81
|
end
|
82
82
|
|
83
|
+
##
|
84
|
+
## Gets the value of every item in the array
|
85
|
+
##
|
86
|
+
## @param path The query path (dot syntax)
|
87
|
+
##
|
88
|
+
## @return [Array] array of values
|
89
|
+
##
|
83
90
|
def get_value(path)
|
84
91
|
map { |el| el.get_value(path) }
|
85
92
|
end
|
86
93
|
|
94
|
+
##
|
95
|
+
## Convert every item in the array to HTML
|
96
|
+
##
|
97
|
+
## @return [String] Html representation of the object.
|
98
|
+
##
|
87
99
|
def to_html
|
88
100
|
map(&:to_html)
|
89
101
|
end
|
90
102
|
|
91
103
|
##
|
92
|
-
## Test if a tag contains an attribute matching filter
|
104
|
+
## Test if a tag contains an attribute matching filter
|
105
|
+
## queries
|
93
106
|
##
|
94
107
|
## @param tag_name [String] The tag name
|
95
108
|
## @param classes [String] The classes to match
|
@@ -101,6 +114,8 @@ class ::Array
|
|
101
114
|
## @param value [String] The value to match
|
102
115
|
## @param descendant [Boolean] Check descendant tags
|
103
116
|
##
|
117
|
+
## @return [Boolean] tag matches
|
118
|
+
##
|
104
119
|
def tag_match(tag_name, classes, id, attribute, operator, value, descendant: false)
|
105
120
|
tag = self
|
106
121
|
keep = true
|
@@ -154,4 +169,26 @@ class ::Array
|
|
154
169
|
keep
|
155
170
|
end
|
156
171
|
end
|
172
|
+
|
173
|
+
##
|
174
|
+
## Clean up output, shrink single-item arrays, ensure array output
|
175
|
+
##
|
176
|
+
## @return [Array] cleaned up array
|
177
|
+
##
|
178
|
+
def clean_output
|
179
|
+
output = dup
|
180
|
+
while output.is_a?(Array) && output.count == 1
|
181
|
+
output = output[0]
|
182
|
+
end
|
183
|
+
output.ensure_array
|
184
|
+
end
|
185
|
+
|
186
|
+
##
|
187
|
+
## Ensure that an object is an array
|
188
|
+
##
|
189
|
+
## @return [Array] object as Array
|
190
|
+
##
|
191
|
+
def ensure_array
|
192
|
+
return self
|
193
|
+
end
|
157
194
|
end
|
data/lib/curly/curl/html.rb
CHANGED
@@ -11,11 +11,17 @@ module Curl
|
|
11
11
|
# Class for CURLing an HTML page
|
12
12
|
class Html
|
13
13
|
attr_accessor :settings, :browser, :source, :headers, :headers_only, :compressed, :clean, :fallback,
|
14
|
-
:ignore_local_links, :ignore_fragment_links, :external_links_only
|
14
|
+
:ignore_local_links, :ignore_fragment_links, :external_links_only, :local_links_only
|
15
15
|
|
16
16
|
attr_reader :url, :code, :meta, :links, :head, :body,
|
17
17
|
:title, :description, :body_links, :body_images
|
18
18
|
|
19
|
+
# Convert self to a hash of data
|
20
|
+
#
|
21
|
+
# @param url [String] A base url to fall back to
|
22
|
+
#
|
23
|
+
# @return [Hash] a hash of data
|
24
|
+
#
|
19
25
|
def to_data(url: nil)
|
20
26
|
{
|
21
27
|
url: @url || url,
|
@@ -63,17 +69,29 @@ module Curl
|
|
63
69
|
@ignore_local_links = options[:ignore_local_links]
|
64
70
|
@ignore_fragment_links = options[:ignore_fragment_links]
|
65
71
|
@external_links_only = options[:external_links_only]
|
72
|
+
@local_links_only = options[:local_links_only]
|
66
73
|
|
67
74
|
@curl = TTY::Which.which('curl')
|
68
75
|
@url = url.nil? ? options[:url] : url
|
69
76
|
end
|
70
77
|
|
78
|
+
##
|
79
|
+
# Parse raw HTML source instead of curling
|
80
|
+
#
|
81
|
+
# @param source [String] The source
|
82
|
+
#
|
83
|
+
#
|
84
|
+
# @return [Hash] Hash of data after processing #
|
85
|
+
#
|
71
86
|
def parse(source)
|
72
87
|
@body = source
|
73
88
|
{ url: @url, code: @code, headers: @headers, meta: @meta, links: @links, head: @head, body: source,
|
74
89
|
source: source.strip, body_links: content_links, body_images: content_images }
|
75
90
|
end
|
76
91
|
|
92
|
+
##
|
93
|
+
## Curl a url, either with curl or Selenium based on browser settings
|
94
|
+
##
|
77
95
|
def curl
|
78
96
|
res = if @url && @browser && @browser != :none
|
79
97
|
source = curl_dynamic_html
|
@@ -283,6 +301,11 @@ module Curl
|
|
283
301
|
output
|
284
302
|
end
|
285
303
|
|
304
|
+
##
|
305
|
+
## String representation
|
306
|
+
##
|
307
|
+
## @return String representation of the object.
|
308
|
+
##
|
286
309
|
def to_s
|
287
310
|
headers = @headers.nil? ? 0 : @headers.count
|
288
311
|
meta = @meta.nil? ? 0 : @meta.count
|
@@ -468,11 +491,19 @@ module Curl
|
|
468
491
|
|
469
492
|
link_href = link_href[2]
|
470
493
|
|
471
|
-
|
494
|
+
if @local_links_only
|
495
|
+
next if @ignore_fragment_links && link_href =~ /^#/
|
496
|
+
|
497
|
+
next unless same_origin?(link_href)
|
498
|
+
|
499
|
+
else
|
500
|
+
next if link_href =~ /^#/ && (@ignore_fragment_links || @external_links_only)
|
501
|
+
|
502
|
+
next if link_href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)
|
472
503
|
|
473
|
-
|
504
|
+
next if same_origin?(link_href) && @external_links_only
|
474
505
|
|
475
|
-
|
506
|
+
end
|
476
507
|
|
477
508
|
link_title = tag.match(/title=(['"])(.*?)\1/)
|
478
509
|
link_title = link_title.nil? ? nil : link_title[2]
|
@@ -500,11 +531,19 @@ module Curl
|
|
500
531
|
link_tags.each do |m|
|
501
532
|
href = m['tag'].match(/href=(["'])(.*?)\1/)
|
502
533
|
href = href[2] unless href.nil?
|
503
|
-
|
534
|
+
if @local_links_only
|
535
|
+
next if href =~ /^#/ && @ignore_fragment_links
|
536
|
+
|
537
|
+
next unless same_origin?(href)
|
504
538
|
|
505
|
-
|
539
|
+
else
|
540
|
+
next if href =~ /^#/ && (@ignore_fragment_links || @external_links_only)
|
541
|
+
|
542
|
+
next if href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)
|
506
543
|
|
507
|
-
|
544
|
+
next if same_origin?(href) && @external_links_only
|
545
|
+
|
546
|
+
end
|
508
547
|
|
509
548
|
title = m['tag'].match(/title=(["'])(.*?)\1/)
|
510
549
|
title = title[2] unless title.nil?
|
data/lib/curly/hash.rb
CHANGED
@@ -2,6 +2,14 @@
|
|
2
2
|
|
3
3
|
# Hash helpers
|
4
4
|
class ::Hash
|
5
|
+
## Convert a Curly object to data hash
|
6
|
+
##
|
7
|
+
## @return [Hash] return a hash with keys renamed and
|
8
|
+
## cleaned up
|
9
|
+
##
|
10
|
+
## @param url [String] A url to fall back to
|
11
|
+
## @param clean [Boolean] Clean extra spaces and newlines in sources
|
12
|
+
##
|
5
13
|
def to_data(url: nil, clean: false)
|
6
14
|
if key?(:body_links)
|
7
15
|
{
|
@@ -23,22 +31,32 @@ class ::Hash
|
|
23
31
|
end
|
24
32
|
end
|
25
33
|
|
34
|
+
##
|
35
|
+
## Return the raw HTML of the object
|
36
|
+
##
|
37
|
+
## @return [String] Html representation of the object.
|
38
|
+
##
|
26
39
|
def to_html
|
27
40
|
if key?(:source)
|
28
41
|
self[:source]
|
29
42
|
end
|
30
43
|
end
|
31
44
|
|
45
|
+
##
|
46
|
+
## Get a value from the hash using a dot-syntax query
|
47
|
+
##
|
48
|
+
## @param query [String] The query (dot notation)
|
49
|
+
##
|
50
|
+
## @return [Object] result of querying the hash
|
51
|
+
##
|
32
52
|
def get_value(query)
|
33
53
|
return nil if self.empty?
|
34
54
|
stringify_keys!
|
35
55
|
|
36
56
|
query.split('.').inject(self) do |v, k|
|
37
|
-
if v.is_a? Array
|
38
|
-
return v.map { |el| el.get_value(k) }
|
39
|
-
end
|
57
|
+
return v.map { |el| el.get_value(k) } if v.is_a? Array
|
40
58
|
# k = k.to_i if v.is_a? Array
|
41
|
-
next unless v.key?(k)
|
59
|
+
next v unless v.key?(k)
|
42
60
|
|
43
61
|
v.fetch(k)
|
44
62
|
end
|
@@ -48,7 +66,7 @@ class ::Hash
|
|
48
66
|
#
|
49
67
|
# @param path [String] The path
|
50
68
|
#
|
51
|
-
# @return Result of path query
|
69
|
+
# @return [Object] Result of path query
|
52
70
|
#
|
53
71
|
def dot_query(path, root = nil, full_tag: true)
|
54
72
|
res = stringify_keys
|
@@ -63,7 +81,6 @@ class ::Hash
|
|
63
81
|
"[#{inter}]"
|
64
82
|
end
|
65
83
|
|
66
|
-
enumerate = false
|
67
84
|
out = []
|
68
85
|
q = path.split(/(?<![\d.])\./)
|
69
86
|
|
@@ -152,6 +169,14 @@ class ::Hash
|
|
152
169
|
out
|
153
170
|
end
|
154
171
|
|
172
|
+
##
|
173
|
+
## Test if values in an array match an operator
|
174
|
+
##
|
175
|
+
## @param array [Array] The array
|
176
|
+
## @param key [String] The key
|
177
|
+
## @param comp [String] The comparison, e.g. *= or $=
|
178
|
+
##
|
179
|
+
## @return [Boolean] true if array contains match
|
155
180
|
def array_match(array, key, comp)
|
156
181
|
keep = false
|
157
182
|
array.each do |el|
|
@@ -353,7 +378,32 @@ class ::Hash
|
|
353
378
|
end
|
354
379
|
end
|
355
380
|
|
381
|
+
##
|
382
|
+
## Destructive version of #stringify_keys
|
383
|
+
##
|
384
|
+
## @see #stringify_keys
|
385
|
+
##
|
356
386
|
def stringify_keys!
|
357
387
|
replace stringify_keys
|
358
388
|
end
|
389
|
+
|
390
|
+
##
|
391
|
+
## Clean up empty arrays and return an array with one or
|
392
|
+
## more elements
|
393
|
+
##
|
394
|
+
## @return [Array] output array
|
395
|
+
##
|
396
|
+
def clean_output
|
397
|
+
output = ensure_array
|
398
|
+
output.clean_output
|
399
|
+
end
|
400
|
+
|
401
|
+
##
|
402
|
+
## Ensure that an object is an array
|
403
|
+
##
|
404
|
+
## @return [Array] object as Array
|
405
|
+
##
|
406
|
+
def ensure_array
|
407
|
+
return [self]
|
408
|
+
end
|
359
409
|
end
|
data/lib/curly/string.rb
CHANGED
@@ -6,6 +6,11 @@
|
|
6
6
|
## @return [String] cleaned string
|
7
7
|
##
|
8
8
|
class ::String
|
9
|
+
## Remove extra spaces and newlines, compress space
|
10
|
+
## between tags
|
11
|
+
##
|
12
|
+
## @return [String] cleaned string
|
13
|
+
##
|
9
14
|
def clean
|
10
15
|
gsub(/[\t\n ]+/m, ' ').gsub(/> +</, '><')
|
11
16
|
end
|
@@ -40,7 +45,7 @@ class ::String
|
|
40
45
|
##
|
41
46
|
## Convert an image type string to a symbol
|
42
47
|
##
|
43
|
-
## @return Symbol :srcset, :img, :opengraph, :all
|
48
|
+
## @return [Symbol] :srcset, :img, :opengraph, :all
|
44
49
|
##
|
45
50
|
def normalize_image_type(default = :all)
|
46
51
|
case self.to_s
|
@@ -58,7 +63,7 @@ class ::String
|
|
58
63
|
##
|
59
64
|
## Convert a browser type string to a symbol
|
60
65
|
##
|
61
|
-
## @return Symbol :chrome, :firefox
|
66
|
+
## @return [Symbol] :chrome, :firefox
|
62
67
|
##
|
63
68
|
def normalize_browser_type(default = :none)
|
64
69
|
case self.to_s
|
@@ -74,7 +79,7 @@ class ::String
|
|
74
79
|
##
|
75
80
|
## Convert a screenshot type string to a symbol
|
76
81
|
##
|
77
|
-
## @return Symbol :full_page, :print_page, :visible
|
82
|
+
## @return [Symbol] :full_page, :print_page, :visible
|
78
83
|
##
|
79
84
|
def normalize_screenshot_type(default = :none)
|
80
85
|
case self.to_s
|
@@ -88,4 +93,23 @@ class ::String
|
|
88
93
|
default.is_a?(Symbol) ? default.to_sym : default.normalize_browser_type
|
89
94
|
end
|
90
95
|
end
|
96
|
+
|
97
|
+
##
|
98
|
+
## Clean up output and return a single-item array
|
99
|
+
##
|
100
|
+
## @return [Array] output array
|
101
|
+
##
|
102
|
+
def clean_output
|
103
|
+
output = ensure_array
|
104
|
+
output.clean_output
|
105
|
+
end
|
106
|
+
|
107
|
+
##
|
108
|
+
## Ensure that an object is an array
|
109
|
+
##
|
110
|
+
## @return [Array] object as Array
|
111
|
+
##
|
112
|
+
def ensure_array
|
113
|
+
return [self]
|
114
|
+
end
|
91
115
|
end
|
data/lib/curly/version.rb
CHANGED
data/lib/curly.rb
CHANGED
data/src/_README.md
CHANGED
@@ -13,7 +13,7 @@ _If you find this useful, feel free to [buy me some coffee][donate]._
|
|
13
13
|
[jq]: https://github.com/jqlang/jq "Command-line JSON processor"
|
14
14
|
[yq]: https://github.com/mikefarah/yq "yq is a portable command-line YAML, JSON, XML, CSV, TOML and properties processor"
|
15
15
|
|
16
|
-
The current version of `curlyq` is <!--VER-->0.0.9<!--END VER-->.
|
16
|
+
The current version of `curlyq` is <!--VER-->0.0.10<!--END VER-->.
|
17
17
|
|
18
18
|
CurlyQ is a utility that provides a simple interface for curl, with additional features for things like extracting images and links, finding elements by CSS selector or XPath, getting detailed header info, and more. It's designed to be part of a scripting pipeline, outputting everything as structured data (JSON or YAML). It also has rudimentary support for making calls to JSON endpoints easier, but it's expected that you'll use something like [jq] to parse the output.
|
19
19
|
|
@@ -68,11 +68,13 @@ Comparisons can be numeric or string comparisons. A numeric comparison like `cur
|
|
68
68
|
|
69
69
|
You can also use dot syntax inside of comparisons, e.g. `[links.rel*=me]` to target the links object (`html` command), and return only the links with a `rel=me` attribute. If the comparison is to an array object (like `class` or `rel`), it will match if any of the elements of the array match your comparison.
|
70
70
|
|
71
|
-
If you end the query with a specific key, only that key will be output. If there's only one match, it will be output as a raw string
|
71
|
+
If you end the query with a specific key, only that key will be output, but it will be in an array. If there's only one match, it will still be output as an array, containing the raw string as its single element.
|
72
72
|
|
73
73
|
curlyq tags --search '#main .post h3' -q '[attrs.id*=what].source' 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/'
|
74
74
|
|
75
|
-
|
75
|
+
[
|
76
|
+
"<h3 id=\"whats-next\">What’s Next</h3>"
|
77
|
+
]
|
76
78
|
|
77
79
|
#### Commands
|
78
80
|
|
@@ -17,8 +17,9 @@ class CurlyQHeadlinksTest < Test::Unit::TestCase
|
|
17
17
|
result = curlyq('headlinks', '-q', '[rel=stylesheet]', 'https://brettterpstra.com')
|
18
18
|
json = JSON.parse(result)
|
19
19
|
|
20
|
-
|
21
|
-
assert_match(/
|
20
|
+
assert_equal(Array, json.class, 'Result should be an array')
|
21
|
+
assert_match(/stylesheet/, json[0]['rel'], 'Should have retrieved a single result with rel stylesheet')
|
22
|
+
assert_match(/screen\.\d+\.css$/, json[0]['href'], 'Stylesheet should be correct primary stylesheet')
|
22
23
|
end
|
23
24
|
|
24
25
|
def test_headlinks
|
data/test/curlyq_html_test.rb
CHANGED
@@ -14,12 +14,12 @@ class CurlyQHtmlTest < Test::Unit::TestCase
|
|
14
14
|
result = curlyq('html', '-s', '#main article .aligncenter', '-q', 'images[1]', 'https://brettterpstra.com')
|
15
15
|
json = JSON.parse(result)
|
16
16
|
|
17
|
-
assert_match(/aligncenter/, json['class'], 'Should have found an image with class "aligncenter"')
|
17
|
+
assert_match(/aligncenter/, json[0]['class'], 'Should have found an image with class "aligncenter"')
|
18
18
|
end
|
19
19
|
|
20
20
|
def test_html_query
|
21
21
|
result = curlyq('html', '-q', 'meta.title', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/')
|
22
|
-
|
23
|
-
assert_match(/Introducing CurlyQ/,
|
22
|
+
json = JSON.parse(result)
|
23
|
+
assert_match(/Introducing CurlyQ/, json[0], 'Should have retrived the page title')
|
24
24
|
end
|
25
25
|
end
|
data/test/curlyq_scrape_test.rb
CHANGED
@@ -11,12 +11,42 @@ class CurlyQScrapeTest < Test::Unit::TestCase
|
|
11
11
|
include CurlyQHelpers
|
12
12
|
|
13
13
|
def setup
|
14
|
+
@screenshot = File.join(File.dirname(__FILE__), 'screenshot_test')
|
15
|
+
FileUtils.rm_f("#{@screenshot}.pdf") if File.exist?("#{@screenshot}.pdf")
|
16
|
+
FileUtils.rm_f('screenshot_test.png') if File.exist?("#{@screenshot}.png")
|
17
|
+
FileUtils.rm_f("#{@screenshot}_full.png") if File.exist?("#{@screenshot}_full.png")
|
14
18
|
end
|
15
19
|
|
16
|
-
def
|
20
|
+
def teardown
|
21
|
+
FileUtils.rm_f("#{@screenshot}.pdf") if File.exist?("#{@screenshot}.pdf")
|
22
|
+
FileUtils.rm_f('screenshot_test.png') if File.exist?("#{@screenshot}.png")
|
23
|
+
FileUtils.rm_f("#{@screenshot}_full.png") if File.exist?("#{@screenshot}_full.png")
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_scrape_firefox
|
17
27
|
result = curlyq('scrape', '-b', 'firefox', '-q', 'links[rel=me&content*=mastodon][0]', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/')
|
18
28
|
json = JSON.parse(result)
|
19
29
|
|
20
|
-
|
30
|
+
assert_equal(Array, json.class, 'Result should be an Array')
|
31
|
+
assert_match(/Mastodon/, json[0]['content'], 'Should have retrieved a Mastodon link')
|
32
|
+
end
|
33
|
+
|
34
|
+
def test_scrape_chrome
|
35
|
+
result = curlyq('scrape', '-b', 'chrome', '-q', 'links[rel=me&content*=mastodon][0]', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/')
|
36
|
+
json = JSON.parse(result)
|
37
|
+
|
38
|
+
assert_equal(Array, json.class, 'Result should be an Array')
|
39
|
+
assert_match(/Mastodon/, json[0]['content'], 'Should have retrieved a Mastodon link')
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_screenshot
|
43
|
+
curlyq('screenshot', '-b', 'firefox', '-o', @screenshot, '-t', 'print', 'https://brettterpstra.com')
|
44
|
+
assert(File.exist?("#{@screenshot}.pdf"), 'PDF Screenshot should exist')
|
45
|
+
|
46
|
+
curlyq('screenshot', '-b', 'chrome', '-o', @screenshot, '-t', 'visible', 'https://brettterpstra.com')
|
47
|
+
assert(File.exist?("#{@screenshot}.png"), 'PNG Screenshot should exist')
|
48
|
+
|
49
|
+
curlyq('screenshot', '-b', 'firefox', '-o', "#{@screenshot}_full", '-t', 'full', 'https://brettterpstra.com')
|
50
|
+
assert(File.exist?("#{@screenshot}_full.png"), 'PNG Screenshot should exist')
|
21
51
|
end
|
22
52
|
end
|
data/test/curlyq_tags_test.rb
CHANGED
@@ -14,18 +14,26 @@ class CurlyQTagsTest < Test::Unit::TestCase
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def test_tags
|
17
|
-
result = curlyq('tags', '--search', '#main .post h3', '
|
17
|
+
result = curlyq('tags', '--search', '#main .post h3', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/')
|
18
18
|
json = JSON.parse(result)
|
19
19
|
|
20
|
-
assert_equal(json.
|
21
|
-
|
20
|
+
assert_equal(Array, json.class, 'Should be an array of matches')
|
21
|
+
assert_equal(6, json.count, 'Should be six results')
|
22
22
|
end
|
23
23
|
|
24
24
|
def test_clean
|
25
25
|
result = curlyq('tags', '--search', '#main section.related', '--clean', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/')
|
26
26
|
json = JSON.parse(result)
|
27
27
|
|
28
|
-
assert_equal(json.
|
28
|
+
assert_equal(Array, json.class, 'Should be a single Array')
|
29
|
+
assert_equal(1, json.count, 'Should be one element')
|
29
30
|
assert_match(%r{Last.fm</h5></a></li>}, json[0]['source'], 'Should have matched #whats-next')
|
30
31
|
end
|
32
|
+
|
33
|
+
def test_query
|
34
|
+
result = curlyq('tags', '--search', '#main .post h3', '-q', '[attrs.id*=what].source', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/')
|
35
|
+
json = JSON.parse(result)
|
36
|
+
assert_equal(Array, json.class, 'Should be an array')
|
37
|
+
assert_match(%r{^<h3 id="whats-next">What’s Next</h3>$}, json[0], 'Should have returned just source')
|
38
|
+
end
|
31
39
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curlyq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.9
|
4
|
+
version: 0.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brett Terpstra
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-01-
|
11
|
+
date: 2024-01-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -236,6 +236,7 @@ files:
|
|
236
236
|
- lib/curly/curl/html.rb
|
237
237
|
- lib/curly/curl/json.rb
|
238
238
|
- lib/curly/hash.rb
|
239
|
+
- lib/curly/numeric.rb
|
239
240
|
- lib/curly/string.rb
|
240
241
|
- lib/curly/version.rb
|
241
242
|
- src/_README.md
|