curlyq 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/curlyq.rdoc ADDED
@@ -0,0 +1,355 @@
+ == curlyq - A scriptable interface to curl
+
+ v0.0.1
+
+ === Global Options
+ === --help
+ Show this message
+
+
+
+ === --[no-]pretty
+ Output "pretty" JSON
+
+
+
+ === --version
+ Display the program version
+
+
+
+ === -y|--[no-]yaml
+ Output YAML instead of JSON
+
+
+
+ === Commands
+ ==== Command: <tt>extract URL...</tt>
+ Extract contents between two regular expressions
+
+
+ ===== Options
+ ===== -a|--after arg
+
+ Text after extraction, parsed as regex
+
+ [Default Value] None
+
+
+ ===== -b|--before arg
+
+ Text before extraction, parsed as regex
+
+ [Default Value] None
+
+
+ ===== -h|--header arg
+
+ Define a header to send as key=value
+
+ [Default Value] None
+
+
+ ===== -c|--[no-]compressed
+ Expect compressed results
+
+
+
+ ===== --[no-]clean
+ Remove extra whitespace from results
+
+
+
+ ===== --[no-]strip
+ Strip HTML tags from results
+
+
+
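As an illustration, a hypothetical invocation (the URL and patterns are placeholders, not taken from the package):

   curlyq extract --before '<h1>' --after '</h1>' --strip https://example.com

This prints whatever falls between the first matches of the two regexes, with HTML tags stripped.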
+ ==== Command: <tt>headlinks URL...</tt>
+ Return all <head> links on URL's page
+
+
+ ===== Options
+ ===== -q|--query|--filter arg
+
+ Filter output using dot-syntax path
+
+ [Default Value] None
+
+
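A hypothetical example, filtering the returned link objects with a dot-syntax path (the query is illustrative; the exact path depends on the output structure):

   curlyq headlinks -q '[rel=stylesheet]' https://example.com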
+ ==== Command: <tt>help command</tt>
+ Shows a list of commands or help for one command
+
+ Gets help for the application or its commands. Can also list the commands in a format helpful for creating a bash-style completion function
+ ===== Options
+ ===== -c
+ List commands one per line, to assist with shell completion
+
+
+
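For example, to get help on one command, or to list all commands for a completion script:

   curlyq help html
   curlyq help -c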
+ ==== Command: <tt>html|curl URL...</tt>
+ Curl URL and output its elements, multiple URLs allowed
+
+
+ ===== Options
+ ===== -b|--browser arg
+
+ Use a browser to retrieve a dynamic web page (firefox, chrome)
+
+ [Default Value] None
+ [Must Match] (?-mix:^[fc].*?$)
+
+
+ ===== -f|--fallback arg
+
+ If curl doesn't work, use a fallback browser (firefox, chrome)
+
+ [Default Value] None
+ [Must Match] (?-mix:^[fc].*?$)
+
+
+ ===== -h|--header arg
+
+ Define a header to send as "key=value"
+
+ [Default Value] None
+
+
+ ===== -q|--query|--filter arg
+
+ Filter output using dot-syntax path
+
+ [Default Value] None
+
+
+ ===== -r|--raw arg
+
+ Output a raw value for a key
+
+ [Default Value] None
+
+
+ ===== --search arg
+
+ Return an array of matches to a CSS or XPath query
+
+ [Default Value] None
+
+
+ ===== -I|--info
+ Only retrieve headers/info
+
+
+
+ ===== -c|--compressed
+ Expect compressed results
+
+
+
+ ===== --[no-]clean
+ Remove extra whitespace from results
+
+
+
+ ===== --[no-]ignore_fragments
+ Ignore fragment hrefs when gathering content links
+
+
+
+ ===== --[no-]ignore_relative
+ Ignore relative hrefs when gathering content links
+
+
+
+ ===== -x|--external_links_only
+ Only gather external links
+
+
+
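A hypothetical invocation combining a CSS search with a browser fallback for dynamic pages (URL and selector are placeholders):

   curlyq html --search 'article h2' --fallback firefox https://example.com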
+ ==== Command: <tt>images URL...</tt>
+ Extract all images from a URL
+
+
+ ===== Options
+ ===== -t|--type arg
+
+ Type of images to return (img, srcset, opengraph, all)
+
+ [Default Value] ["all"]
+
+
+ ===== -c|--[no-]compressed
+ Expect compressed results
+
+
+
+ ===== --[no-]clean
+ Remove extra whitespace from results
+
+
+
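For instance, to collect only srcset images (placeholder URL):

   curlyq images -t srcset https://example.com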
+ ==== Command: <tt>json URL...</tt>
+ Get a JSON response from a URL, multiple URLs allowed
+
+
+ ===== Options
+ ===== -h|--header arg
+
+ Define a header to send as key=value
+
+ [Default Value] None
+
+
+ ===== -q|--query|--filter arg
+
+ Filter output using dot-syntax path
+
+ [Default Value] None
+
+
+ ===== -c|--[no-]compressed
+ Expect compressed results
+
+
+
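A sketch of typical use (the endpoint and query path are hypothetical):

   curlyq json -q 'data.items' https://api.example.com/v1/posts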
+ ==== Command: <tt>links URL...</tt>
+ Return all links on a URL's page
+
+
+ ===== Options
+ ===== -q|--query|--filter arg
+
+ Filter output using dot-syntax path
+
+ [Default Value] None
+
+
+ ===== -d|--[no-]dedup
+ Filter out duplicate links, preserving only the first one
+
+
+
+ ===== --[no-]ignore_fragments
+ Ignore fragment hrefs when gathering content links
+
+
+
+ ===== --[no-]ignore_relative
+ Ignore relative hrefs when gathering content links
+
+
+
+ ===== -x|--external_links_only
+ Only gather external links
+
+
+
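For example, gathering only deduplicated external links while skipping fragment hrefs (placeholder URL):

   curlyq links -d -x --ignore_fragments https://example.com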
+ ==== Command: <tt>scrape URL...</tt>
+ Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed.
+
+
+ ===== Options
+ ===== -b|--browser arg
+
+ Browser to use (firefox, chrome)
+
+ [Default Value] None
+
+
+ ===== -h|--header arg
+
+ Define a header to send as "key=value"
+
+ [Default Value] None
+
+
+ ===== -q|--query|--filter arg
+
+ Filter output using dot-syntax path
+
+ [Default Value] None
+
+
+ ===== -r|--raw arg
+
+ Output a raw value for a key
+
+ [Default Value] None
+
+
+ ===== --search arg
+
+ Return an array of matches to a CSS or XPath query
+
+ [Default Value] None
+
+
+ ===== --[no-]clean
+ Remove extra whitespace from results
+
+
+
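A hypothetical invocation (assumes Firefox is installed, as noted above):

   curlyq scrape -b firefox --search 'h2.post-title' https://example.com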
+ ==== Command: <tt>screenshot URL...</tt>
+ Save a screenshot of the URL
+
+
+ ===== Options
+ ===== -b|--browser arg
+
+ Browser to use (firefox, chrome)
+
+ [Default Value] chrome
+ [Must Match] (?-mix:^[fc].*?$)
+
+
+ ===== -o|--out|--file arg
+
+ File destination
+
+ [Default Value] None
+
+
+ ===== -t|--type arg
+
+ Type of screenshot to save (full (requires firefox), print, visible)
+
+ [Default Value] full
+ [Must Match] (?-mix:^[fpv].*?$)
+
+
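For example, a full-page capture, which per the --type note above requires Firefox (the output path is a placeholder):

   curlyq screenshot -b firefox -t full -o ~/Desktop/example.png https://example.com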
+ ==== Command: <tt>tags URL...</tt>
+ Extract all instances of a tag
+
+
+ ===== Options
+ ===== -h|--header arg
+
+ Define a header to send as key=value
+
+ [Default Value] None
+
+
+ ===== -q|--query|--search arg
+
+ CSS/XPath query
+
+ [Default Value] None
+
+
+ ===== -t|--tag arg
+
+ Specify a tag to collect
+
+ [Default Value] None
+
+
+ ===== -c|--[no-]compressed
+ Expect compressed results
+
+
+
+ ===== --[no-]clean
+ Remove extra whitespace from results
+
+
+
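A hypothetical example collecting every h2 tag on a page:

   curlyq tags -t h2 https://example.com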
@@ -0,0 +1,134 @@
+ # frozen_string_literal: true
+
+ # Array helpers
+ class ::Array
+   ##
+   ## Remove extra spaces from each element of an array of
+   ## strings
+   ##
+   ## @return [Array] cleaned array
+   ##
+   def clean
+     map(&:clean)
+   end
+
+   ##
+   ## @see #clean
+   ##
+   def clean!
+     replace clean
+   end
+
+   ##
+   ## Strip HTML tags from each element of an array of
+   ## strings
+   ##
+   ## @return [Array] array of strings with HTML tags removed
+   ##
+   def strip_tags
+     map(&:strip_tags)
+   end
+
+   ##
+   ## Destructive version of #strip_tags
+   ##
+   ## @see #strip_tags
+   ##
+   def strip_tags!
+     replace strip_tags
+   end
+
+   ##
+   ## Remove duplicate links from an array of link objects
+   ##
+   ## @return [Array] deduped array of link objects
+   ##
+   def dedup_links
+     used = []
+     good = []
+     each do |link|
+       href = link[:href].sub(%r{/$}, '')
+       next if used.include?(href)
+
+       used.push(href)
+       good.push(link)
+     end
+
+     good
+   end
+
+   ##
+   ## Destructive version of #dedup_links
+   ##
+   ## @see #dedup_links
+   ##
+   def dedup_links!
+     replace dedup_links
+   end
+
+   ##
+   ## Convert and execute a dot-syntax query on the array
+   ##
+   ## @param path [String] The dot-syntax path
+   ##
+   ## @return [Array] Matching elements
+   ##
+   def dot_query(path)
+     output = []
+     if path =~ /^\[([\d.]+)\]\.?/
+       # A leading index like [1] or [1..3] selects elements before the
+       # rest of the path is applied; eval turns "1..3" into a Range
+       int = Regexp.last_match(1)
+       path.sub!(/^\[[\d.]+\]\.?/, '')
+       items = self[eval(int)]
+     else
+       items = self
+     end
+
+     if items.is_a? Hash
+       # Hash#dot_query is defined elsewhere in the gem
+       output = items.dot_query(path)
+     else
+       items.each do |item|
+         res = item.is_a?(Hash) ? item.stringify_keys : item
+         out = []
+         # Split the path on dots, ignoring dots inside numeric ranges
+         q = path.split(/(?<![\d.])\./)
+         q.each do |pth|
+           el = Regexp.last_match(1) if pth =~ /\[([0-9,.]+)\]/
+           pth.sub!(/\[([0-9,.]+)\]/, '')
+           ats = []
+           at = []
+           # Collect [key=value] comparisons into filter groups: a comma
+           # separates groups, & and + combine within the current group
+           while pth =~ /\[[+&,]?\w+ *[\^*$=<>]=? *\w+/
+             m = pth.match(/\[(?<com>[,+&])? *(?<key>\w+) *(?<op>[\^*$=<>]{1,2}) *(?<val>\w+) */)
+             comp = [m['key'], m['op'], m['val']]
+             case m['com']
+             when ','
+               ats.push(comp)
+               at = []
+             else
+               at.push(comp)
+             end
+
+             pth.sub!(/\[(?<com>[,&+])? *(?<key>\w+) *(?<op>[\^*$=<>]{1,2}) *(?<val>\w+)/, '[')
+           end
+           ats.push(at) unless at.empty?
+           pth.sub!(/\[\]/, '')
+
+           return false if el.nil? && ats.empty? && !res.key?(pth)
+
+           res = res[pth] unless pth.empty?
+
+           while ats.count.positive?
+             atr = ats.shift
+
+             # evaluate_comp (defined elsewhere in the gem) applies the
+             # comparison operators to each candidate element
+             keepers = res.filter do |r|
+               evaluate_comp(r, atr)
+             end
+             out.concat(keepers)
+           end
+
+           out = out[eval(el)] if out.is_a?(Array) && el =~ /^[\d.,]+$/
+         end
+         output.push(out)
+       end
+     end
+     output
+   end
+ end
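As a usage sketch of the helpers above (the data is hypothetical; String#clean, String#strip_tags, Hash#dot_query, stringify_keys, and evaluate_comp are defined elsewhere in the gem):

   links = [
     { href: 'https://example.com/about/', text: 'About' },
     { href: 'https://example.com/about', text: 'About (duplicate)' },
     { href: 'https://example.com/blog', text: 'Blog' }
   ]

   # Trailing slashes are stripped before comparison, so the second
   # entry is treated as a duplicate of the first and dropped
   links.dedup_links

   # A leading [0] selects the first element, then the remainder of
   # the path ('href') is delegated to Hash#dot_query
   links.dot_query('[0].href')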