curlyq 0.0.4 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -65,7 +65,13 @@ module Curl
65
65
  @external_links_only = options[:external_links_only]
66
66
 
67
67
  @curl = TTY::Which.which('curl')
68
- @url = url
68
+ @url = url.nil? ? options[:url] : url
69
+ end
70
+
71
+ def parse(source)
72
+ @body = source
73
+ { url: @url, code: @code, headers: @headers, meta: @meta, links: @links, head: @head, body: source,
74
+ source: source.strip, body_links: content_links, body_images: content_images }
69
75
  end
70
76
 
71
77
  def curl
@@ -118,10 +124,15 @@ module Curl
118
124
  ##
119
125
  ## @return [Array] array of matches
120
126
  ##
121
- def extract(before, after)
122
- before = /#{Regexp.escape(before)}/ unless before.instance_of?(Regexp)
123
- after = /#{Regexp.escape(after)}/ unless after.instance_of?(Regexp)
124
- rx = /(?<=#{before.source})(.*?)(?=#{after.source})/m
127
+ def extract(before, after, inclusive: false)
128
+ before = /#{Regexp.escape(before)}/ unless before.is_a?(Regexp)
129
+ after = /#{Regexp.escape(after)}/ unless after.is_a?(Regexp)
130
+
131
+ if inclusive
132
+ rx = /(#{before.source}.*?#{after.source})/m
133
+ else
134
+ rx = /(?<=#{before.source})(.*?)(?=#{after.source})/m
135
+ end
125
136
  @body.scan(rx).map { |r| @clean ? r[0].clean : r[0] }
126
137
  end
127
138
 
@@ -232,11 +243,11 @@ module Curl
232
243
  when /source/
233
244
  next unless %i[all srcset].include?(type)
234
245
 
235
- srcsets = img[:attrs].filter { |k| k[:key] =~ /srcset/i }
246
+ srcsets = img[:attrs].filter { |k| k == 'srcset' }
236
247
  if srcsets.count.positive?
237
248
  srcset = []
238
- srcsets.each do |src|
239
- src[:value].split(/ *, */).each do |s|
249
+ srcsets.each do |k, v|
250
+ v.split(/ *, */).each do |s|
240
251
  image, media = s.split(/ /)
241
252
  srcset << {
242
253
  src: image,
@@ -252,15 +263,14 @@ module Curl
252
263
  end
253
264
  when /img/
254
265
  next unless %i[all img].include?(type)
255
-
256
- width = img[:attrs].select { |a| a[:key] == 'width' }.first[:value]
257
- height = img[:attrs].select { |a| a[:key] == 'height' }.first[:value]
258
- alt = img[:attrs].select { |a| a[:key] == 'alt' }.first[:value]
259
- title = img[:attrs].select { |a| a[:key] == 'title' }.first[:value]
266
+ width = img[:attrs]['width']
267
+ height = img[:attrs]['height']
268
+ alt = img[:attrs]['alt']
269
+ title = img[:attrs]['title']
260
270
 
261
271
  output << {
262
272
  type: 'img',
263
- src: img[:attrs].filter { |a| a[:key] =~ /src/i }.first[:value],
273
+ src: img[:attrs]['src'],
264
274
  width: width || 'unknown',
265
275
  height: height || 'unknown',
266
276
  alt: alt,
@@ -313,8 +323,9 @@ module Curl
313
323
  ## @param el [Nokogiri] element to convert
314
324
  ##
315
325
  def nokogiri_to_tag(el)
316
- attributes = el.attribute_nodes.map do |a|
317
- { key: a.name, value: a.name =~ /^(class|rel)$/ ? a.value.split(/ /) : a.value }
326
+ attributes = {}
327
+ attributes = el.attribute_nodes.each_with_object({}) do |a, hsh|
328
+ hsh[a.name] = a.name =~ /^(class|rel)$/ ? a.value.split(/ /) : a.value
318
329
  end
319
330
 
320
331
  {
@@ -343,12 +354,16 @@ module Curl
343
354
  ##
344
355
  ## @return [Array] array of matched elements
345
356
  ##
346
- def search(path, source: @source)
357
+ def search(path, source: @source, return_source: false)
347
358
  doc = Nokogiri::HTML(source)
348
359
  output = []
349
- doc.search(path).each do |el|
350
- out = nokogiri_to_tag(el)
351
- output.push(out)
360
+ if return_source
361
+ output = doc.search(path).to_html
362
+ else
363
+ doc.search(path).each do |el|
364
+ out = nokogiri_to_tag(el)
365
+ output.push(out)
366
+ end
352
367
  end
353
368
  output
354
369
  end
@@ -390,12 +405,12 @@ module Curl
390
405
  attrs = tag['attrs'].strip.to_enum(:scan, /(?ix)
391
406
  (?<key>[@a-z0-9-]+)(?:=(?<quot>["'])
392
407
  (?<value>[^"']+)\k<quot>|[ >])?/i).map { Regexp.last_match }
393
- attrs.map! { |a| { key: a['key'], value: a['key'] =~ /^(class|rel)$/ ? a['value'].split(/ /) : a['value'] } }
408
+ attributes = attrs.each_with_object({}) { |a, hsh| hsh[a['key']] = a['key'] =~ /^(class|rel)$/ ? a['value'].split(/ /) : a['value'] }
394
409
  end
395
410
  {
396
411
  tag: tag['tag'],
397
412
  source: tag.to_s,
398
- attrs: attrs,
413
+ attrs: attributes,
399
414
  content: @clean ? tag['content']&.clean : tag['content'],
400
415
  tags: content_tags(tag['content'])
401
416
  }
@@ -480,6 +495,7 @@ module Curl
480
495
  ##
481
496
  def content_links
482
497
  links = []
498
+
483
499
  link_tags = @body.to_enum(:scan, %r{<a ?(?<tag>.*?)>(?<text>.*?)</a>}).map { Regexp.last_match }
484
500
  link_tags.each do |m|
485
501
  href = m['tag'].match(/href=(["'])(.*?)\1/)
@@ -534,7 +550,7 @@ module Curl
534
550
  ## @return [String] page source
535
551
  ##
536
552
  def curl_dynamic_html
537
- browser = @browser.normalize_browser_type if @browser.is_a?(String)
553
+ browser = @browser.is_a?(String) ? @browser.normalize_browser_type : @browser
538
554
  res = nil
539
555
 
540
556
  driver = Selenium::WebDriver.for browser
@@ -607,7 +623,7 @@ module Curl
607
623
  ##
608
624
  def curl_html(url = nil, source: nil, headers: nil,
609
625
  headers_only: false, compressed: false, fallback: false)
610
- unless url.nil?
626
+ if !url.nil?
611
627
  flags = 'SsL'
612
628
  flags += @headers_only ? 'I' : 'i'
613
629
  agents = [
@@ -620,8 +636,8 @@ module Curl
620
636
  compress = @compressed ? '--compressed' : ''
621
637
  @source = `#{@curl} -#{flags} #{compress} #{headers} '#{@url}' 2>/dev/null`
622
638
  agent = 0
623
- while source.nil? || source.empty?
624
- source = `#{@curl} -#{flags} #{compress} -A "#{agents[agent]}" #{headers} '#{@url}' 2>/dev/null`
639
+ while @source.nil? || @source.empty?
640
+ @source = `#{@curl} -#{flags} #{compress} -A "#{agents[agent]}" #{headers} '#{@url}' 2>/dev/null`
625
641
  break if agent >= agents.count - 1
626
642
  end
627
643
 
@@ -630,49 +646,50 @@ module Curl
630
646
  Process.exit 1
631
647
  end
632
648
 
633
- if @fallback && (@source.nil? || @source.empty?)
634
- @source = curl_dynamic_html(@url, @fallback, @headers)
649
+ headers = { 'location' => @url }
650
+ lines = @source.split(/\r\n/)
651
+ code = lines[0].match(/(\d\d\d)/)[1]
652
+ lines.shift
653
+ lines.each_with_index do |line, idx|
654
+ if line =~ /^([\w-]+): (.*?)$/
655
+ m = Regexp.last_match
656
+ headers[m[1]] = m[2]
657
+ else
658
+ @source = lines[idx..].join("\n")
659
+ break
660
+ end
635
661
  end
636
- end
637
662
 
638
- return false if source.nil? || source.empty?
639
-
640
- @source.strip!
663
+ if headers['content-encoding'] =~ /gzip/i && !compressed
664
+ warn 'Response is gzipped, you may need to try again with --compressed'
665
+ end
641
666
 
642
- headers = { 'location' => @url }
643
- lines = @source.split(/\r\n/)
644
- code = lines[0].match(/(\d\d\d)/)[1]
645
- lines.shift
646
- lines.each_with_index do |line, idx|
647
- if line =~ /^([\w-]+): (.*?)$/
648
- m = Regexp.last_match
649
- headers[m[1]] = m[2]
650
- else
651
- @source = lines[idx..].join("\n")
652
- break
667
+ if headers['content-type'] =~ /json/
668
+ return { url: @url, code: code, headers: headers, meta: nil, links: nil,
669
+ head: nil, body: @source.strip, source: @source.strip, body_links: nil, body_images: nil }
653
670
  end
671
+ else
672
+ @source = source unless source.nil?
654
673
  end
655
674
 
656
- if headers['content-encoding'] =~ /gzip/i && !compressed
657
- warn 'Response is gzipped, you may need to try again with --compressed'
658
- end
675
+ @source = curl_dynamic_html(@url, @fallback, @headers) if @fallback && (@source.nil? || @source.empty?)
659
676
 
660
- if headers['content-type'] =~ /json/
661
- return { url: @url, code: code, headers: headers, meta: nil, links: nil,
662
- head: nil, body: @source.strip, source: @source.strip, body_links: nil, body_images: nil }
663
- end
677
+ return false if @source.nil? || @source.empty?
678
+
679
+ @source.strip!
664
680
 
665
- head = source.match(%r{(?<=<head>)(.*?)(?=</head>)}mi)
681
+ head = @source.match(%r{(?<=<head>)(.*?)(?=</head>)}mi)
666
682
 
667
683
  if head.nil?
668
684
  { url: @url, code: code, headers: headers, meta: nil, links: nil, head: nil, body: @source.strip,
669
685
  source: @source.strip, body_links: nil, body_images: nil }
670
686
  else
687
+ @body = @source.match(%r{<body.*?>(.*?)</body>}mi)[1]
671
688
  meta = meta_tags(head[1])
672
689
  links = link_tags(head[1])
673
- body = @source.match(%r{<body.*?>(.*?)</body>}mi)[1]
674
- { url: @url, code: code, headers: headers, meta: meta, links: links, head: head[1], body: body,
675
- source: @source.strip, body_links: body_links, body_images: body_images }
690
+
691
+ { url: @url, code: code, headers: headers, meta: meta, links: links, head: head[1], body: @body,
692
+ source: @source.strip, body_links: nil, body_images: nil }
676
693
  end
677
694
  end
678
695
 
data/lib/curly/hash.rb CHANGED
@@ -2,6 +2,33 @@
2
2
 
3
3
  # Hash helpers
4
4
  class ::Hash
5
+ def to_data(url: nil, clean: false)
6
+ if key?(:body_links)
7
+ {
8
+ url: self[:url] || url,
9
+ code: self[:code],
10
+ headers: self[:headers],
11
+ meta: self[:meta],
12
+ meta_links: self[:links],
13
+ head: clean ? self[:head]&.strip&.clean : self[:head],
14
+ body: clean ? self[:body]&.strip&.clean : self[:body],
15
+ source: clean ? self[:source]&.strip&.clean : self[:source],
16
+ title: self[:title],
17
+ description: self[:description],
18
+ links: self[:body_links],
19
+ images: self[:body_images]
20
+ }
21
+ else
22
+ self
23
+ end
24
+ end
25
+
26
+ def to_html
27
+ if key?(:source)
28
+ self[:source]
29
+ end
30
+ end
31
+
5
32
  # Extract data using a dot-syntax path
6
33
  #
7
34
  # @param path [String] The path
@@ -10,6 +37,7 @@ class ::Hash
10
37
  #
11
38
  def dot_query(path)
12
39
  res = stringify_keys
40
+
13
41
  out = []
14
42
  q = path.split(/(?<![\d.])\./)
15
43
  q.each do |pth|
@@ -18,7 +46,7 @@ class ::Hash
18
46
  ats = []
19
47
  at = []
20
48
  while pth =~ /\[[+&,]?\w+ *[\^*$=<>]=? *\w+/
21
- m = pth.match(/\[(?<com>[,+&])? *(?<key>\w+) *(?<op>[\^*$=<>]{1,2}) *(?<val>\w+) */)
49
+ m = pth.match(/\[(?<com>[,+&])? *(?<key>\w+) *(?<op>[\^*$=<>]{1,2}) *(?<val>[^,&\]]+) */)
22
50
  comp = [m['key'], m['op'], m['val']]
23
51
  case m['com']
24
52
  when ','
@@ -28,22 +56,27 @@ class ::Hash
28
56
  at.push(comp)
29
57
  end
30
58
 
31
- pth.sub!(/\[(?<com>[,&+])? *(?<key>\w+) *(?<op>[\^*$=<>]{1,2}) *(?<val>\w+)/, '[')
59
+ pth.sub!(/\[(?<com>[,&+])? *(?<key>\w+) *(?<op>[\^*$=<>]{1,2}) *(?<val>[^,&\]]+)/, '[')
32
60
  end
33
61
  ats.push(at) unless at.empty?
34
62
  pth.sub!(/\[\]/, '')
35
63
 
64
+ res = res[0] if res.is_a?(Array)
65
+
36
66
  return false if el.nil? && ats.empty? && !res.key?(pth)
37
67
 
38
68
  res = res[pth] unless pth.empty?
39
69
 
70
+ return false if res.nil?
71
+
40
72
  if ats.count.positive?
41
73
  while ats.count.positive?
42
74
  atr = ats.shift
43
-
75
+ res = [res] if res.is_a?(Hash)
44
76
  keepers = res.filter do |r|
45
77
  evaluate_comp(r, atr)
46
78
  end
79
+
47
80
  out.concat(keepers)
48
81
  end
49
82
  else
@@ -52,6 +85,7 @@ class ::Hash
52
85
 
53
86
  out = out[eval(el)] if out.is_a?(Array) && el =~ /^[\d.,]+$/
54
87
  end
88
+
55
89
  out
56
90
  end
57
91
 
@@ -60,13 +94,15 @@ class ::Hash
60
94
  ##
61
95
  ## @param r [Hash] hash of source elements and
62
96
  ## comparison operators
63
- ## @param atr [String] The attribute to compare
97
+ ## @param atr [Array] Array of arrays containing [attribute, comparator, value]
64
98
  ##
65
99
  ## @return [Boolean] whether the comparison passes or fails
66
100
  ##
67
101
  def evaluate_comp(r, atr)
68
102
  keep = true
69
103
 
104
+ r = r.symbolize_keys
105
+
70
106
  atr.each do |a|
71
107
  key = a[0].to_sym
72
108
  val = if a[2] =~ /^\d+$/
@@ -118,7 +154,7 @@ class ::Hash
118
154
  end
119
155
 
120
156
  ##
121
- ## Test if a hash contains a tag matching filter queries
157
+ ## Test if a tag contains an attribute matching filter queries
122
158
  ##
123
159
  ## @param tag_name [String] The tag name
124
160
  ## @param classes [String] The classes to match
@@ -184,10 +220,26 @@ class ::Hash
184
220
  end
185
221
  end
186
222
 
187
- # Turn all keys into string
223
+ # Turn all keys into symbols
224
+ #
225
+ # If the hash has both a string and a symbol for key,
226
+ # keep the symbol value, discarding the string value
227
+ #
228
+ # @return [Hash] a copy of the hash where all its
229
+ # keys are symbols
230
+ #
231
+ def symbolize_keys
232
+ each_with_object({}) do |(k, v), hsh|
233
+ next if k.is_a?(String) && key?(k.to_sym)
234
+
235
+ hsh[k.to_sym] = v.is_a?(Hash) ? v.symbolize_keys : v
236
+ end
237
+ end
238
+
239
+ # Turn all keys into strings
188
240
  #
189
241
  # If the hash has both a string and a symbol for key,
190
- # keep the string value, discarding the symnbol value
242
+ # keep the string value, discarding the symbol value
191
243
  #
192
244
  # @return [Hash] a copy of the hash where all its
193
245
  # keys are strings
data/lib/curly/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Curly
2
- VERSION = '0.0.4'
2
+ VERSION = '0.0.6'
3
3
  end
data/src/_README.md CHANGED
@@ -10,7 +10,7 @@ _If you find this useful, feel free to [buy me some coffee][donate]._
10
10
  [donate]: https://brettterpstra.com/donate
11
11
  <!--END GITHUB-->
12
12
 
13
- The current version of `curlyq` is <!--VER-->0.0.3<!--END VER-->.
13
+ The current version of `curlyq` is <!--VER-->0.0.4<!--END VER-->.
14
14
 
15
15
  CurlyQ is a utility that provides a simple interface for curl, with additional features for things like extracting images and links, finding elements by CSS selector or XPath, getting detailed header info, and more. It's designed to be part of a scripting pipeline, outputting everything as structured data (JSON or YAML). It also has rudimentary support for making calls to JSON endpoints easier, but it's expected that you'll use something like `jq` to parse the output.
16
16
 
@@ -39,12 +39,41 @@ Run `curlyq help` for a list of subcommands. Run `curlyq help SUBCOMMAND` for de
39
39
  @cli(bundle exec bin/curlyq help)
40
40
  ```
41
41
 
42
+ ### Query and Search syntax
43
+
44
+ You can shape the results using `--search` (`-s`) and `--query` (`-q`) on some commands.
45
+
46
+ A search uses either CSS or XPath syntax to locate elements. For example, if you wanted to locate all of the `<article>` elements with a class of `post` inside of the div with an id of `main`, you would run `--search '#main article.post'`. Searches can target tags, ids, and classes, and can accept `>` to target direct descendants. You can also use XPaths, but I hate those so I'm not going to document them.
47
+
48
+ Queries are specifically for shaping CurlyQ output. If you're using the `html` command, it returns a key called `images`, so you can target just the images in the response with `-q 'images'`. The queries accept array syntax, so to get the first image, you would use `-q 'images[0]'`. Ranges are accepted as well, so `-q 'images[1..4]'` will return the 2nd through 5th images found on the page. You can also do comparisons, e.g. `-q 'images[rel=me]'` to target only images with a `rel` attribute of `me`.
49
+
50
+ The comparisons for the query flag are:
51
+
52
+ - `<` less than
53
+ - `>` greater than
54
+ - `<=` less than or equal to
55
+ - `>=` greater than or equal to
56
+ - `=` or `==` is equal to
57
+ - `*=` contains text
58
+ - `^=` starts with text
59
+ - `$=` ends with text
60
+
42
61
  #### Commands
43
62
 
44
63
  curlyq makes use of subcommands, e.g. `curlyq html [options] URL` or `curlyq extract [options] URL`. Each subcommand takes its own options, but I've made an effort to standardize the choices between each command as much as possible.
45
64
 
46
65
  ##### extract
47
66
 
67
+ Example:
68
+
69
+ curlyq extract -i -b 'Adding' -a 'accessing the source.' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python'
70
+
71
+ [
72
+ "Adding <code>time.sleep(10)</code> in various places in case the page had not fully loaded when I was accessing the source."
73
+ ]
74
+
75
+ This specifies a before and after string and includes them (`-i`) in the result.
76
+
48
77
  ```
49
78
  @cli(bundle exec bin/curlyq help extract)
50
79
  ```
@@ -52,36 +81,212 @@ curlyq makes use of subcommands, e.g. `curlyq html [options] URL` or `curlyq ext
52
81
 
53
82
  ##### headlinks
54
83
 
84
+ Example:
85
+
86
+ curlyq headlinks -q '[rel=stylesheet]' https://brettterpstra.com
87
+
88
+ {
89
+ "rel": "stylesheet",
90
+ "href": "https://cdn3.brettterpstra.com/stylesheets/screen.7261.css",
91
+ "type": "text/css",
92
+ "title": null
93
+ }
94
+
95
+ This pulls all `<link>` tags from the `<head>` of the page, and uses a query `-q` to only show links with `rel="stylesheet"`.
96
+
55
97
  ```
56
98
  @cli(bundle exec bin/curlyq help headlinks)
57
99
  ```
58
100
 
59
101
  ##### html
60
102
 
103
+ The html command (aliased as `curl`) gets the entire text of the web page and provides a JSON response with a breakdown of:
104
+
105
+ - URL, after any redirects
106
+ - Response code
107
+ - Response headers as a keyed hash
108
+ - Meta elements for the page as a keyed hash
109
+ - All meta links in the head as an array of objects containing (as available):
110
+ - rel
111
+ - href
112
+ - type
113
+ - title
114
+ - source of `<head>`
115
+ - source of `<body>`
116
+ - the page title (determined first by og:title, then by a title tag)
117
+ - description (using og:description first)
118
+ - All links on the page as an array of objects with:
119
+ - href
120
+ - title
121
+ - rel
122
+ - text content
123
+ - classes as array
124
+ - All images on the page as an array of objects containing:
125
+ - class
126
+ - all attributes as key/value pairs
127
+ - width and height (if specified)
128
+ - src
129
+ - alt and title
130
+
131
+ You can add a query (`-q`) to only get the information needed, e.g. `-q images[width>600]`.
132
+
133
+ Example:
134
+
135
+ curlyq html -s '#main article .aligncenter' -q 'images[1]' 'https://brettterpstra.com'
136
+
137
+ [
138
+ {
139
+ "class": "aligncenter",
140
+ "original": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb_tw.jpg",
141
+ "at2x": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb@2x.jpg",
142
+ "width": "800",
143
+ "height": "226",
144
+ "src": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb.jpg",
145
+ "alt": "Giveaway Robot with Keyboard Maestro icon",
146
+ "title": "Giveaway Robot with Keyboard Maestro icon"
147
+ }
148
+ ]
149
+
150
+ The above example queries the full html of the page, but narrows the elements using `--search` and then takes the 2nd image from the results.
151
+
152
+ curlyq html -q 'meta.title' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/
153
+
154
+ Introducing CurlyQ, a pipeline-oriented curl helper - BrettTerpstra.com
155
+
156
+ The above example curls the page and returns the title attribute found in the meta (`-q 'meta.title'`).
157
+
61
158
  ```
62
159
  @cli(bundle exec bin/curlyq help html)
63
160
  ```
64
161
 
65
162
  ##### images
66
163
 
164
+ The images command returns only the images on the page as an array of objects. It can be queried to match certain requirements (see Query and Search syntax above).
165
+
166
+ The base command will return all images on the page, including OpenGraph images from the head, `<img>` tags from the body, and `<srcset>` tags along with their child images.
167
+
168
+ OpenGraph images will be returned with the structure:
169
+
170
+ {
171
+ "type": "opengraph",
172
+ "attrs": null,
173
+ "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg"
174
+ }
175
+
176
+ `img` tags will be returned with the structure:
177
+
178
+ {
179
+ "type": "img",
180
+ "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb.jpg",
181
+ "width": "800",
182
+ "height": "226",
183
+ "alt": "Banner image for CurlyQ",
184
+ "title": "CurlyQ, curl better",
185
+ "attrs": [
186
+ {
187
+ "class": [
188
+ "aligncenter"
189
+ ], // all attributes included
190
+ }
191
+ ]
192
+ }
193
+
194
+
195
+
196
+ `srcset` images will be returned with the structure:
197
+
198
+ {
199
+ "type": "srcset",
200
+ "attrs": [
201
+ {
202
+ "key": "srcset",
203
+ "value": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg 1x, https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg 2x"
204
+ }
205
+ ],
206
+ "images": [
207
+ {
208
+ "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg",
209
+ "media": "1x"
210
+ },
211
+ {
212
+ "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg",
213
+ "media": "2x"
214
+ }
215
+ ]
216
+ }
217
+
218
+
219
+ Example:
220
+
221
+ curlyq images -t img -q '[alt$=screenshot]' https://brettterpstra.com
222
+
223
+ This will return an array of images that are `<img>` tags, and only show the ones that have an `alt` attribute that ends with `screenshot`.
224
+
225
+ curlyq images -q '[width>750]' https://brettterpstra.com
226
+
227
+ This example will only return images that have a width greater than 750 pixels. This query depends on the images having proper `width` attributes set on them in the source.
228
+
67
229
  ```
68
230
  @cli(bundle exec bin/curlyq help images)
69
231
  ```
70
232
 
71
233
  ##### json
72
234
 
235
+ The `json` command just returns an object with header/response info, and the contents of the JSON response after it's been read by the Ruby JSON library and output. If there are fetching or parsing errors it will fail gracefully with an error code.
236
+
73
237
  ```
74
238
  @cli(bundle exec bin/curlyq help json)
75
239
  ```
76
240
 
77
241
  ##### links
78
242
 
243
+ Returns all the links on the page, which can be queried on any attribute.
244
+
245
+ Example:
246
+
247
+ curlyq links -q '[content*=twitter]' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python'
248
+
249
+ [
250
+ {
251
+ "href": "https://twitter.com/stackoverflow",
252
+ "title": null,
253
+ "rel": null,
254
+ "content": "Twitter",
255
+ "class": [
256
+ "-link",
257
+ "js-gps-track"
258
+ ]
259
+ }
260
+ ]
261
+
262
+ This example gets all links from the page but only returns ones with link content containing 'twitter' (`-q '[content*=twitter]'`).
263
+
79
264
  ```
80
265
  @cli(bundle exec bin/curlyq help links)
81
266
  ```
82
267
 
83
268
  ##### scrape
84
269
 
270
+ Loads the page in a web browser, allowing scraping of dynamically loaded pages that return nothing but scripts when `curl`ed. The `-b` (`--browser`) option is required and should be 'chrome' or 'firefox' (or just 'c' or 'f'). The selected browser must be installed on your system.
271
+
272
+ Example:
273
+
274
+ curlyq scrape -b firefox -q 'links[rel=me&content*=mastodon][0]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/
275
+
276
+ {
277
+ "href": "https://nojack.easydns.ca/@ttscoff",
278
+ "title": null,
279
+ "rel": [
280
+ "me"
281
+ ],
282
+ "content": "Mastodon",
283
+ "class": [
284
+ "u-url"
285
+ ]
286
+ }
287
+
288
+ This example scrapes the page using firefox and finds the first link with a rel of 'me' and text containing 'mastodon'.
289
+
85
290
  ```
86
291
  @cli(bundle exec bin/curlyq help scrape)
87
292
  ```
@@ -90,12 +295,45 @@ curlyq makes use of subcommands, e.g. `curlyq html [options] URL` or `curlyq ext
90
295
 
91
296
  Full-page screenshots require Firefox, installed and specified with `--browser firefox`.
92
297
 
298
+ Type defaults to `full`, but will only work if `-b` is Firefox. If you want to use Chrome, you must specify a `--type` as 'visible' or 'print'.
299
+
300
+ The `-o` (`--output`) flag is required. It should be a path to a target PNG file (or PDF for `-t print` output). Extension will be modified automatically, all you need is the base name.
301
+
302
+ Example:
303
+
304
+ curlyq screenshot -b f -o ~/Desktop/test https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/
305
+
306
+ Screenshot saved to /Users/ttscoff/Desktop/test.png
307
+
308
+
93
309
  ```
94
310
  @cli(bundle exec bin/curlyq help screenshot)
95
311
  ```
96
312
 
97
313
  ##### tags
98
314
 
315
+ Return a hierarchy of all tags in a page. Use `-t` to limit to a specific tag.
316
+
317
+ curlyq tags --search '#main .post h3' -q 'attrs[id*=what]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/
318
+
319
+ [
320
+ {
321
+ "tag": "h3",
322
+ "source": "<h3 id=\"whats-next\">What’s Next</h3>",
323
+ "attrs": [
324
+ {
325
+ "id": "whats-next"
326
+ }
327
+ ],
328
+ "content": "What’s Next",
329
+ "tags": [
330
+
331
+ ]
332
+ }
333
+ ]
334
+
335
+ The above command filters the tags based on a CSS query, then further filters them to just tags with an id containing 'what'.
336
+
99
337
  ```
100
338
  @cli(bundle exec bin/curlyq help tags)
101
339
  ```