curlyq 0.0.4 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.irbrc +4 -0
- data/CHANGELOG.md +36 -0
- data/Gemfile.lock +30 -7
- data/README.md +260 -17
- data/Rakefile +32 -4
- data/bin/curlyq +114 -52
- data/curlyq.gemspec +7 -3
- data/lib/curly/array.rb +80 -55
- data/lib/curly/curl/html.rb +71 -54
- data/lib/curly/hash.rb +59 -7
- data/lib/curly/version.rb +1 -1
- data/src/_README.md +239 -1
- data/test/curlyq_extract_test.rb +43 -0
- data/test/curlyq_headlinks_test.rb +32 -0
- data/test/curlyq_html_test.rb +25 -0
- data/test/curlyq_images_test.rb +27 -0
- data/test/curlyq_json_test.rb +33 -0
- data/test/curlyq_links_test.rb +20 -0
- data/test/curlyq_scrape_test.rb +22 -0
- data/test/curlyq_tags_test.rb +31 -0
- data/test/helpers/curlyq-helpers.rb +29 -0
- data/test/helpers/fake_std_out.rb +30 -0
- data/test/helpers/threaded_tests.rb +182 -0
- data/test/test_helper.rb +7 -2
- metadata +101 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 376f17c2844a60ca0932187b1fe2c4c3f487c0bc63fbb44abe3062036b8c394f
|
4
|
+
data.tar.gz: 9b84f93db1c13dabc20f33b394506619c77272cccb4d2e85f70f6e74cb6238dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 909543c28f2192856e0168fdd7da20383892a0c8cde67503209bfbbf69831edd79bdd03884426af0e8a78743d3b7344ce4e70a175c059e4c1356eaf949199d01
|
7
|
+
data.tar.gz: e242883b09cae56ba55df004afe9febba753cdd21d46639b328e0df73322eb18d37187ecdbfc47030ff84c0d46540b4ee7acabffd7aecf4a28facfc4af8e0a24
|
data/.gitignore
CHANGED
data/.irbrc
ADDED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,39 @@
|
|
1
|
+
### 0.0.6
|
2
|
+
|
3
|
+
2024-01-12 14:44
|
4
|
+
|
5
|
+
#### CHANGED
|
6
|
+
|
7
|
+
- Attributes array is now a hash directly keyed to the attribute key
|
8
|
+
|
9
|
+
#### NEW
|
10
|
+
|
11
|
+
- Tags command has option to output only raw html of matched tags
|
12
|
+
|
13
|
+
#### FIXED
|
14
|
+
|
15
|
+
- --query works with --search on scrape and tags command
|
16
|
+
- Json command dot query works now
|
17
|
+
|
18
|
+
### 0.0.5
|
19
|
+
|
20
|
+
2024-01-11 18:06
|
21
|
+
|
22
|
+
#### IMPROVED
|
23
|
+
|
24
|
+
- Add --query capabilities to images command
|
25
|
+
- Add --query to links command
|
26
|
+
- Allow hyphens in query syntax
|
27
|
+
- Allow any character other than comma, ampersand, or right square bracket in query value
|
28
|
+
|
29
|
+
#### FIXED
|
30
|
+
|
31
|
+
- Html --search returns a full Curl::Html object
|
32
|
+
- --query works better with --search and is consistent with other query functions
|
33
|
+
- Scrape command outputting malformed data
|
34
|
+
- Hash output when --query is used with scrape
|
35
|
+
- Nil match on tags command
|
36
|
+
|
1
37
|
### 0.0.4
|
2
38
|
|
3
39
|
2024-01-10 13:54
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
curlyq (0.0.4)
|
4
|
+
curlyq (0.0.6)
|
5
5
|
gli (~> 2.21.0)
|
6
6
|
nokogiri (~> 1.16.0)
|
7
7
|
selenium-webdriver (~> 4.16.0)
|
@@ -11,19 +11,38 @@ GEM
|
|
11
11
|
remote: https://rubygems.org/
|
12
12
|
specs:
|
13
13
|
gli (2.21.1)
|
14
|
-
minitest (5.16.3)
|
15
14
|
nokogiri (1.16.0-arm64-darwin)
|
16
15
|
racc (~> 1.4)
|
16
|
+
parallel (1.23.0)
|
17
|
+
parallel_tests (3.13.0)
|
18
|
+
parallel
|
19
|
+
pastel (0.8.0)
|
20
|
+
tty-color (~> 0.5)
|
21
|
+
power_assert (2.0.3)
|
17
22
|
racc (1.7.3)
|
18
|
-
rake (
|
19
|
-
rdoc (
|
23
|
+
rake (13.1.0)
|
24
|
+
rdoc (6.3.3)
|
20
25
|
rexml (3.2.6)
|
21
26
|
rubyzip (2.3.2)
|
22
27
|
selenium-webdriver (4.16.0)
|
23
28
|
rexml (~> 3.2, >= 3.2.5)
|
24
29
|
rubyzip (>= 1.2.2, < 3.0)
|
25
30
|
websocket (~> 1.0)
|
31
|
+
strings-ansi (0.2.0)
|
32
|
+
test-unit (3.4.9)
|
33
|
+
power_assert
|
34
|
+
tty-color (0.6.0)
|
35
|
+
tty-cursor (0.7.1)
|
36
|
+
tty-progressbar (0.18.2)
|
37
|
+
strings-ansi (~> 0.2)
|
38
|
+
tty-cursor (~> 0.7)
|
39
|
+
tty-screen (~> 0.8)
|
40
|
+
unicode-display_width (>= 1.6, < 3.0)
|
41
|
+
tty-screen (0.8.2)
|
42
|
+
tty-spinner (0.9.3)
|
43
|
+
tty-cursor (~> 0.7)
|
26
44
|
tty-which (0.5.0)
|
45
|
+
unicode-display_width (2.5.0)
|
27
46
|
websocket (1.2.10)
|
28
47
|
yard (0.9.34)
|
29
48
|
|
@@ -32,9 +51,13 @@ PLATFORMS
|
|
32
51
|
|
33
52
|
DEPENDENCIES
|
34
53
|
curlyq!
|
35
|
-
|
36
|
-
|
37
|
-
|
54
|
+
parallel_tests (~> 3.7, >= 3.7.3)
|
55
|
+
pastel (~> 0.8.0)
|
56
|
+
rake (~> 13.0, >= 13.0.1)
|
57
|
+
rdoc (~> 6.3.1)
|
58
|
+
test-unit (~> 3.4.4)
|
59
|
+
tty-progressbar (~> 0.18, >= 0.18.2)
|
60
|
+
tty-spinner (~> 0.9, >= 0.9.3)
|
38
61
|
yard (~> 0.9, >= 0.9.26)
|
39
62
|
|
40
63
|
BUNDLED WITH
|
data/README.md
CHANGED
@@ -10,7 +10,7 @@ _If you find this useful, feel free to [buy me some coffee][donate]._
|
|
10
10
|
[donate]: https://brettterpstra.com/donate
|
11
11
|
|
12
12
|
|
13
|
-
The current version of `curlyq` is 0.0.4
|
13
|
+
The current version of `curlyq` is 0.0.6
|
14
14
|
.
|
15
15
|
|
16
16
|
CurlyQ is a utility that provides a simple interface for curl, with additional features for things like extracting images and links, finding elements by CSS selector or XPath, getting detailed header info, and more. It's designed to be part of a scripting pipeline, outputting everything as structured data (JSON or YAML). It also has rudimentary support for making calls to JSON endpoints easier, but it's expected that you'll use something like `jq` to parse the output.
|
@@ -44,7 +44,7 @@ SYNOPSIS
|
|
44
44
|
curlyq [global options] command [command options] [arguments...]
|
45
45
|
|
46
46
|
VERSION
|
47
|
-
0.0.4
|
47
|
+
0.0.6
|
48
48
|
|
49
49
|
GLOBAL OPTIONS
|
50
50
|
--help - Show this message
|
@@ -65,12 +65,41 @@ COMMANDS
|
|
65
65
|
tags - Extract all instances of a tag
|
66
66
|
```
|
67
67
|
|
68
|
+
### Query and Search syntax
|
69
|
+
|
70
|
+
You can shape the results using `--search` (`-s`) and `--query` (`-q`) on some commands.
|
71
|
+
|
72
|
+
A search uses either CSS or XPath syntax to locate elements. For example, if you wanted to locate all of the `<article>` elements with a class of `post` inside of the div with an id of `main`, you would run `--search '#main article.post'`. Searches can target tags, ids, and classes, and can accept `>` to target direct descendants. You can also use XPaths, but I hate those so I'm not going to document them.
|
73
|
+
|
74
|
+
Queries are specifically for shaping CurlyQ output. If you're using the `html` command, it returns a key called `images`, so you can target just the images in the response with `-q 'images'`. The queries accept array syntax, so to get the first image, you would use `-q 'images[0]'`. Ranges are accepted as well, so `-q 'images[1..4]'` will return the 2nd through 5th images found on the page. You can also do comparisons, e.g. `images[rel=me]'` to target only images with a `rel` attribute of `me`.
|
75
|
+
|
76
|
+
The comparisons for the query flag are:
|
77
|
+
|
78
|
+
- `<` less than
|
79
|
+
- `>` greater than
|
80
|
+
- `<=` less than or equal to
|
81
|
+
- `>=` greater than or equal to
|
82
|
+
- `=` or `==` is equal to
|
83
|
+
- `*=` contains text
|
84
|
+
- `^=` starts with text
|
85
|
+
- `$=` ends with text
|
86
|
+
|
68
87
|
#### Commands
|
69
88
|
|
70
89
|
curlyq makes use of subcommands, e.g. `curlyq html [options] URL` or `curlyq extract [options] URL`. Each subcommand takes its own options, but I've made an effort to standardize the choices between each command as much as possible.
|
71
90
|
|
72
91
|
##### extract
|
73
92
|
|
93
|
+
Example:
|
94
|
+
|
95
|
+
curlyq extract -i -b 'Adding' -a 'accessing the source.' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python'
|
96
|
+
|
97
|
+
[
|
98
|
+
"Adding <code>time.sleep(10)</code> in various places in case the page had not fully loaded when I was accessing the source."
|
99
|
+
]
|
100
|
+
|
101
|
+
This specifies a before and after string and includes them (`-i`) in the result.
|
102
|
+
|
74
103
|
```
|
75
104
|
NAME
|
76
105
|
extract - Extract contents between two regular expressions
|
@@ -80,17 +109,32 @@ SYNOPSIS
|
|
80
109
|
curlyq [global options] extract [command options] URL...
|
81
110
|
|
82
111
|
COMMAND OPTIONS
|
83
|
-
-a, --after=arg - Text after extraction
|
84
|
-
-b, --before=arg - Text before extraction
|
112
|
+
-a, --after=arg - Text after extraction (default: none)
|
113
|
+
-b, --before=arg - Text before extraction (default: none)
|
85
114
|
-c, --[no-]compressed - Expect compressed results
|
86
115
|
--[no-]clean - Remove extra whitespace from results
|
87
116
|
-h, --header=arg - Define a header to send as key=value (may be used more than once, default: none)
|
117
|
+
-i, --[no-]include - Include the before/after matches in the result
|
118
|
+
-r, --[no-]regex - Process before/after strings as regular expressions
|
88
119
|
--[no-]strip - Strip HTML tags from results
|
89
120
|
```
|
90
121
|
|
91
122
|
|
92
123
|
##### headlinks
|
93
124
|
|
125
|
+
Example:
|
126
|
+
|
127
|
+
curlyq headlinks -q '[rel=stylesheet]' https://brettterpstra.com
|
128
|
+
|
129
|
+
{
|
130
|
+
"rel": "stylesheet",
|
131
|
+
"href": "https://cdn3.brettterpstra.com/stylesheets/screen.7261.css",
|
132
|
+
"type": "text/css",
|
133
|
+
"title": null
|
134
|
+
}
|
135
|
+
|
136
|
+
This pulls all `<link>` tags from the `<head>` of the page, and uses a query `-q` to only show links with `rel="stylesheet"`.
|
137
|
+
|
94
138
|
```
|
95
139
|
NAME
|
96
140
|
headlinks - Return all <head> links on URL's page
|
@@ -105,6 +149,61 @@ COMMAND OPTIONS
|
|
105
149
|
|
106
150
|
##### html
|
107
151
|
|
152
|
+
The html command (aliased as `curl`) gets the entire text of the web page and provides a JSON response with a breakdown of:
|
153
|
+
|
154
|
+
- URL, after any redirects
|
155
|
+
- Response code
|
156
|
+
- Response headers as a keyed hash
|
157
|
+
- Meta elements for the page as a keyed hash
|
158
|
+
- All meta links in the head as an array of objects containing (as available):
|
159
|
+
- rel
|
160
|
+
- href
|
161
|
+
- type
|
162
|
+
- title
|
163
|
+
- source of `<head>`
|
164
|
+
- source of `<body>`
|
165
|
+
- the page title (determined first by og:title, then by a title tag)
|
166
|
+
- description (using og:description first)
|
167
|
+
- All links on the page as an array of objects with:
|
168
|
+
- href
|
169
|
+
- title
|
170
|
+
- rel
|
171
|
+
- text content
|
172
|
+
- classes as array
|
173
|
+
- All images on the page as an array of objects containing:
|
174
|
+
- class
|
175
|
+
- all attributes as key/value pairs
|
176
|
+
- width and height (if specified)
|
177
|
+
- src
|
178
|
+
- alt and title
|
179
|
+
|
180
|
+
You can add a query (`-q`) to only get the information needed, e.g. `-q images[width>600]`.
|
181
|
+
|
182
|
+
Example:
|
183
|
+
|
184
|
+
curlyq html -s '#main article .aligncenter' -q 'images[1]' 'https://brettterpstra.com'
|
185
|
+
|
186
|
+
[
|
187
|
+
{
|
188
|
+
"class": "aligncenter",
|
189
|
+
"original": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb_tw.jpg",
|
190
|
+
"at2x": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb@2x.jpg",
|
191
|
+
"width": "800",
|
192
|
+
"height": "226",
|
193
|
+
"src": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb.jpg",
|
194
|
+
"alt": "Giveaway Robot with Keyboard Maestro icon",
|
195
|
+
"title": "Giveaway Robot with Keyboard Maestro icon"
|
196
|
+
}
|
197
|
+
]
|
198
|
+
|
199
|
+
The above example queries the full html of the page, but narrows the elements using `--search` and then takes the 2nd image from the results.
|
200
|
+
|
201
|
+
curlyq html -q 'meta.title' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/
|
202
|
+
|
203
|
+
Introducing CurlyQ, a pipeline-oriented curl helper - BrettTerpstra.com
|
204
|
+
|
205
|
+
The above example curls the page and returns the title attribute found in the meta (`-q 'meta.title'`).
|
206
|
+
|
108
207
|
```
|
109
208
|
NAME
|
110
209
|
html - Curl URL and output its elements, multiple URLs allowed
|
@@ -124,12 +223,77 @@ COMMAND OPTIONS
|
|
124
223
|
--[no-]ignore_relative - Ignore relative hrefs when gathering content links
|
125
224
|
-q, --query, --filter=arg - Filter output using dot-syntax path (default: none)
|
126
225
|
-r, --raw=arg - Output a raw value for a key (default: none)
|
127
|
-
--search=arg
|
226
|
+
-s, --search=arg - Return an array of matches to a CSS or XPath query (default: none)
|
128
227
|
-x, --external_links_only - Only gather external links
|
129
228
|
```
|
130
229
|
|
131
230
|
##### images
|
132
231
|
|
232
|
+
The images command returns only the images on the page as an array of objects. It can be queried to match certain requirements (see Query and Search syntax above).
|
233
|
+
|
234
|
+
The base command will return all images on the page, including OpenGraph images from the head, `<img>` tags from the body, and `<srcset>` tags along with their child images.
|
235
|
+
|
236
|
+
OpenGraph images will be returned with the structure:
|
237
|
+
|
238
|
+
{
|
239
|
+
"type": "opengraph",
|
240
|
+
"attrs": null,
|
241
|
+
"src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg"
|
242
|
+
}
|
243
|
+
|
244
|
+
`img` tags will be returned with the structure:
|
245
|
+
|
246
|
+
{
|
247
|
+
"type": "img",
|
248
|
+
"src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb.jpg",
|
249
|
+
"width": "800",
|
250
|
+
"height": "226",
|
251
|
+
"alt": "Banner image for CurlyQ",
|
252
|
+
"title": "CurlyQ, curl better",
|
253
|
+
"attrs": [
|
254
|
+
{
|
255
|
+
"class": [
|
256
|
+
"aligncenter"
|
257
|
+
], // all attributes included
|
258
|
+
}
|
259
|
+
]
|
260
|
+
}
|
261
|
+
|
262
|
+
|
263
|
+
|
264
|
+
`srcset` images will be returned with the structure:
|
265
|
+
|
266
|
+
{
|
267
|
+
"type": "srcset",
|
268
|
+
"attrs": [
|
269
|
+
{
|
270
|
+
"key": "srcset",
|
271
|
+
"value": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg 1x, https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg 2x"
|
272
|
+
}
|
273
|
+
],
|
274
|
+
"images": [
|
275
|
+
{
|
276
|
+
"src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg",
|
277
|
+
"media": "1x"
|
278
|
+
},
|
279
|
+
{
|
280
|
+
"src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg",
|
281
|
+
"media": "2x"
|
282
|
+
}
|
283
|
+
]
|
284
|
+
}
|
285
|
+
}
|
286
|
+
|
287
|
+
Example:
|
288
|
+
|
289
|
+
curlyq images -t img -q '[alt$=screenshot]' https://brettterpstra.com
|
290
|
+
|
291
|
+
This will return an array of images that are `<img>` tags, and only show the ones that have an `alt` attribute that ends with `screenshot`.
|
292
|
+
|
293
|
+
curlyq images -q '[width>750]' https://brettterpstra.com
|
294
|
+
|
295
|
+
This example will only return images that have a width greater than 750 pixels. This query depends on the images having proper `width` attributes set on them in the source.
|
296
|
+
|
133
297
|
```
|
134
298
|
NAME
|
135
299
|
images - Extract all images from a URL
|
@@ -139,14 +303,17 @@ SYNOPSIS
|
|
139
303
|
curlyq [global options] images [command options] URL...
|
140
304
|
|
141
305
|
COMMAND OPTIONS
|
142
|
-
-c, --[no-]compressed
|
143
|
-
--[no-]clean
|
144
|
-
-h, --header=arg
|
145
|
-
-
|
306
|
+
-c, --[no-]compressed - Expect compressed results
|
307
|
+
--[no-]clean - Remove extra whitespace from results
|
308
|
+
-h, --header=arg - Define a header to send as key=value (may be used more than once, default: none)
|
309
|
+
-q, --query, --filter=arg - Filter output using dot-syntax path (default: none)
|
310
|
+
-t, --type=arg - Type of images to return (img, srcset, opengraph, all) (may be used more than once, default: ["all"])
|
146
311
|
```
|
147
312
|
|
148
313
|
##### json
|
149
314
|
|
315
|
+
The `json` command just returns an object with header/response info, and the contents of the JSON response after it's been read by the Ruby JSON library and output. If there are fetching or parsing errors it will fail gracefully with an error code.
|
316
|
+
|
150
317
|
```
|
151
318
|
NAME
|
152
319
|
json - Get a JSON response from a URL, multiple URLs allowed
|
@@ -163,6 +330,27 @@ COMMAND OPTIONS
|
|
163
330
|
|
164
331
|
##### links
|
165
332
|
|
333
|
+
Returns all the links on the page, which can be queried on any attribute.
|
334
|
+
|
335
|
+
Example:
|
336
|
+
|
337
|
+
curlyq links -q '[content*=twitter]' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python'
|
338
|
+
|
339
|
+
[
|
340
|
+
{
|
341
|
+
"href": "https://twitter.com/stackoverflow",
|
342
|
+
"title": null,
|
343
|
+
"rel": null,
|
344
|
+
"content": "Twitter",
|
345
|
+
"class": [
|
346
|
+
"-link",
|
347
|
+
"js-gps-track"
|
348
|
+
]
|
349
|
+
}
|
350
|
+
]
|
351
|
+
|
352
|
+
This example gets all links from the page but only returns ones with link content containing 'twitter' (`-q '[content*=twitter]'`).
|
353
|
+
|
166
354
|
```
|
167
355
|
NAME
|
168
356
|
links - Return all links on a URL's page
|
@@ -181,6 +369,26 @@ COMMAND OPTIONS
|
|
181
369
|
|
182
370
|
##### scrape
|
183
371
|
|
372
|
+
Loads the page in a web browser, allowing scraping of dynamically loaded pages that return nothing but scripts when `curl`ed. The `-b` (`--browser`) option is required and should be 'chrome' or 'firefox' (or just 'c' or 'f'). The selected browser must be installed on your system.
|
373
|
+
|
374
|
+
Example:
|
375
|
+
|
376
|
+
curlyq scrape -b firefox -q 'links[rel=me&content*=mastodon][0]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/
|
377
|
+
|
378
|
+
{
|
379
|
+
"href": "https://nojack.easydns.ca/@ttscoff",
|
380
|
+
"title": null,
|
381
|
+
"rel": [
|
382
|
+
"me"
|
383
|
+
],
|
384
|
+
"content": "Mastodon",
|
385
|
+
"class": [
|
386
|
+
"u-url"
|
387
|
+
]
|
388
|
+
}
|
389
|
+
|
390
|
+
This example scrapes the page using firefox and finds the first link with a rel of 'me' and text containing 'mastodon'.
|
391
|
+
|
184
392
|
```
|
185
393
|
NAME
|
186
394
|
scrape - Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed.
|
@@ -190,7 +398,7 @@ SYNOPSIS
|
|
190
398
|
curlyq [global options] scrape [command options] URL...
|
191
399
|
|
192
400
|
COMMAND OPTIONS
|
193
|
-
-b, --browser=arg - Browser to use (firefox, chrome) (default: none)
|
401
|
+
-b, --browser=arg - Browser to use (firefox, chrome) (required, default: none)
|
194
402
|
--[no-]clean - Remove extra whitespace from results
|
195
403
|
-h, --header=arg - Define a header to send as "key=value" (may be used more than once, default: none)
|
196
404
|
-q, --query, --filter=arg - Filter output using dot-syntax path (default: none)
|
@@ -202,6 +410,17 @@ COMMAND OPTIONS
|
|
202
410
|
|
203
411
|
Full-page screenshots require Firefox, installed and specified with `--browser firefox`.
|
204
412
|
|
413
|
+
Type defaults to `full`, but will only work if `-b` is Firefox. If you want to use Chrome, you must specify a `--type` as 'visible' or 'print'.
|
414
|
+
|
415
|
+
The `-o` (`--output`) flag is required. It should be a path to a target PNG file (or PDF for `-t print` output). Extension will be modified automatically, all you need is the base name.
|
416
|
+
|
417
|
+
Example:
|
418
|
+
|
419
|
+
curlyq screenshot -b f -o ~/Desktop/test https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/
|
420
|
+
|
421
|
+
Screenshot saved to /Users/ttscoff/Desktop/test.png
|
422
|
+
|
423
|
+
|
205
424
|
```
|
206
425
|
NAME
|
207
426
|
screenshot - Save a screenshot of a URL
|
@@ -213,12 +432,34 @@ SYNOPSIS
|
|
213
432
|
COMMAND OPTIONS
|
214
433
|
-b, --browser=arg - Browser to use (firefox, chrome) (default: chrome)
|
215
434
|
-h, --header=arg - Define a header to send as key=value (may be used more than once, default: none)
|
216
|
-
-o, --out, --file=arg - File destination (default: none)
|
217
|
-
-t, --type=arg - Type of screenshot to save (full (requires firefox), print, visible) (default:
|
435
|
+
-o, --out, --file=arg - File destination (required, default: none)
|
436
|
+
-t, --type=arg - Type of screenshot to save (full (requires firefox), print, visible) (default: visible)
|
218
437
|
```
|
219
438
|
|
220
439
|
##### tags
|
221
440
|
|
441
|
+
Return a hierarchy of all tags in a page. Use `-t` to limit to a specific tag.
|
442
|
+
|
443
|
+
curlyq tags --search '#main .post h3' -q 'attrs[id*=what]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/
|
444
|
+
|
445
|
+
[
|
446
|
+
{
|
447
|
+
"tag": "h3",
|
448
|
+
"source": "<h3 id=\"whats-next\">What???s Next</h3>",
|
449
|
+
"attrs": [
|
450
|
+
{
|
451
|
+
"id": "whats-next"
|
452
|
+
}
|
453
|
+
],
|
454
|
+
"content": "What???s Next",
|
455
|
+
"tags": [
|
456
|
+
|
457
|
+
]
|
458
|
+
}
|
459
|
+
]
|
460
|
+
|
461
|
+
The above command filters the tags based on a CSS query, then further filters them to just tags with an id containing 'what'.
|
462
|
+
|
222
463
|
```
|
223
464
|
NAME
|
224
465
|
tags - Extract all instances of a tag
|
@@ -228,11 +469,13 @@ SYNOPSIS
|
|
228
469
|
curlyq [global options] tags [command options] URL...
|
229
470
|
|
230
471
|
COMMAND OPTIONS
|
231
|
-
-c, --[no-]compressed
|
232
|
-
--[no-]clean
|
233
|
-
-h, --header=
|
234
|
-
-q, --query, --
|
235
|
-
|
472
|
+
-c, --[no-]compressed - Expect compressed results
|
473
|
+
--[no-]clean - Remove extra whitespace from results
|
474
|
+
-h, --header=KEY=VAL - Define a header to send as key=value (may be used more than once, default: none)
|
475
|
+
-q, --query, --filter=DOT_SYNTAX - Dot syntax query to filter results (default: none)
|
476
|
+
--search=CSS/XPATH - Return an array of matches to a CSS or XPath query (default: none)
|
477
|
+
--[no-]source, --[no-]html - Output the HTML source of the results
|
478
|
+
-t, --tag=TAG - Specify a tag to collect (may be used more than once, default: none)
|
236
479
|
```
|
237
480
|
|
238
481
|
|
data/Rakefile
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
require 'rake/clean'
|
2
|
+
require 'rake/testtask'
|
2
3
|
require 'rubygems'
|
3
4
|
require 'rubygems/package_task'
|
4
5
|
require 'rdoc/task'
|
5
6
|
require 'yard'
|
7
|
+
require 'parallel_tests'
|
8
|
+
require 'parallel_tests/tasks'
|
9
|
+
require 'tty-spinner'
|
6
10
|
|
7
11
|
YARD::Rake::YardocTask.new do |t|
|
8
12
|
t.files = ['lib/curly/*.rb']
|
@@ -22,10 +26,34 @@ spec = eval(File.read('curlyq.gemspec'))
|
|
22
26
|
|
23
27
|
Gem::PackageTask.new(spec) do |pkg|
|
24
28
|
end
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
+
|
30
|
+
namespace :test do
|
31
|
+
FileList['test/*_test.rb'].each do |rakefile|
|
32
|
+
test_name = File.basename(rakefile, '.rb').sub(/^.*?_(.*?)_.*?$/, '\1')
|
33
|
+
|
34
|
+
Rake::TestTask.new(:"#{test_name}") do |t|
|
35
|
+
t.libs << ['test', 'test/helpers']
|
36
|
+
t.pattern = rakefile
|
37
|
+
t.verbose = ENV['VERBOSE'] =~ /(true|1)/i ? true : false
|
38
|
+
end
|
39
|
+
# Define default task for :test
|
40
|
+
task default: test_name
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
desc 'Run one test verbosely'
|
45
|
+
task :test_one, :test do |_, args|
|
46
|
+
args.with_defaults(test: '*')
|
47
|
+
puts `bundle exec rake test TESTOPTS="-v" TEST="test/curlyq_#{args[:test]}_test.rb"`
|
48
|
+
end
|
49
|
+
|
50
|
+
desc 'Run all tests, threaded'
|
51
|
+
task :test, :pattern, :threads, :max_tests do |_, args|
|
52
|
+
args.with_defaults(pattern: '*', threads: 8, max_tests: 0)
|
53
|
+
pattern = args[:pattern] =~ /(n[iu]ll?|0|\.)/i ? '*' : args[:pattern]
|
54
|
+
|
55
|
+
require_relative 'test/helpers/threaded_tests'
|
56
|
+
ThreadedTests.new.run(pattern: pattern, max_threads: args[:threads].to_i, max_tests: args[:max_tests])
|
29
57
|
end
|
30
58
|
|
31
59
|
desc 'Development version check'
|