curlyq 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/CHANGELOG.md +8 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +41 -0
- data/LICENSE.txt +19 -0
- data/README.md +233 -0
- data/README.rdoc +6 -0
- data/Rakefile +77 -0
- data/bin/curlyq +477 -0
- data/curlyq.gemspec +27 -0
- data/curlyq.rdoc +355 -0
- data/lib/curly/array.rb +134 -0
- data/lib/curly/curl/html.rb +720 -0
- data/lib/curly/curl/json.rb +108 -0
- data/lib/curly/curl.rb +7 -0
- data/lib/curly/hash.rb +200 -0
- data/lib/curly/string.rb +91 -0
- data/lib/curly/version.rb +3 -0
- data/lib/curly.rb +12 -0
- data/src/_README.md +101 -0
- data/test/default_test.rb +14 -0
- data/test/test_helper.rb +4 -0
- metadata +191 -0
@@ -0,0 +1,108 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Curl
|
4
|
+
# Class for CURLing a JSON response
|
5
|
+
class Json
  # User agent strings tried, in order, when a request sent without an
  # explicit agent comes back empty.
  USER_AGENTS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.1',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.3',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.'
  ].freeze

  attr_reader :url, :code, :json, :headers

  ##
  ## Create a new Curl::Json page object
  ##
  ## @param      url              [String] The url to curl
  ## @param      headers          [Hash] The headers to send
  ## @param      compressed       [Boolean] Expect compressed results
  ## @param      symbolize_names  [Boolean] Parse JSON object keys as symbols
  ##
  ## @return     [Curl::Json] Curl::Json object with url, code, parsed json, and response headers
  ##
  ## @raise      [RuntimeError] if no response could be retrieved
  ##
  def initialize(url, headers: nil, compressed: false, symbolize_names: false)
    @curl = TTY::Which.which('curl')
    page = curl_json(url, headers: headers, compressed: compressed, symbolize_names: symbolize_names)

    raise "Error retrieving #{url}" if page.nil? || page.empty?

    @url = page[:url]
    @code = page[:code]
    @json = page[:json]
    @headers = page[:headers]
  end

  ##
  ## Convert the response to a plain Hash
  ##
  ## @return     [Hash] url, code, json, and headers
  ##
  def to_data
    {
      url: @url,
      code: @code,
      json: @json,
      headers: @headers
    }
  end

  ##
  ## Walk a dot-separated path through parsed JSON. A segment may
  ## carry a numeric index, e.g. "results[0].name".
  ##
  ## @param      path  [String] Dot-separated path
  ## @param      json  [Hash, Array] The data to query (defaults to the parsed response)
  ##
  ## @return     The value at the path, or nil if any segment is missing
  ##
  def path(path, json = @json)
    target = json

    path.to_s.split('.').each do |part|
      return nil if target.nil?

      # Named captures are read from the MatchData: they are only
      # assigned to local variables when the regexp literal is on
      # the left-hand side of =~.
      m = part.match(/(?<key>[^\[]+)\[(?<int>\d+)\]/)
      if m
        list = lookup(target, m[:key])
        target = list.is_a?(Array) ? list[m[:int].to_i] : nil
      else
        target = lookup(target, part)
      end
    end

    target
  end

  private

  ##
  ## Fetch one path segment from a container
  ##
  ## @param      container  [Hash, Array] Current node
  ## @param      key        [String] Segment name (or numeric index for arrays)
  ##
  ## @return     The child value, or nil if not present
  ##
  def lookup(container, key)
    case container
    when Hash
      # Fall back to a symbol key so responses parsed with
      # symbolize_names: true can be queried too.
      container.key?(key) ? container[key] : container[key.to_sym]
    when Array
      container[key.to_i] if key.match?(/\A\d+\z/)
    end
  end

  ##
  ## Curl the JSON contents
  ##
  ## @param      url              [String] The url
  ## @param      headers          [Hash] The headers to send
  ## @param      compressed       [Boolean] Expect compressed results
  ## @param      symbolize_names  [Boolean] Parse JSON object keys as symbols
  ##
  ## @return     [Hash, nil] hash of url, code, headers, and parsed
  ##             json, or nil if every attempt returned nothing
  ##
  def curl_json(url, headers: nil, compressed: false, symbolize_names: false)
    options = ['-SsLi']
    options << '--compressed' if compressed
    header_args = (headers || {}).flat_map { |name, value| ['-H', "#{name}: #{value}"] }

    source = run_curl(options + header_args + [url])

    # Retry once per known user agent; the loop is bounded by the list
    # so an unreachable host cannot hang the caller.
    USER_AGENTS.each do |agent|
      break unless source.nil? || source.empty?

      source = run_curl(options + ['-A', agent] + header_args + [url])
    end

    return nil if source.nil? || source.empty?

    parse_response(source, url, symbolize_names: symbolize_names)
  end

  ##
  ## Run curl with an argument vector. No shell is involved, so
  ## quotes or metacharacters in the URL and headers are passed
  ## through literally. stderr is discarded.
  ##
  ## @param      args  [Array<String>] Arguments for curl
  ##
  ## @return     [String, nil] curl's stdout, or nil if curl could not be run
  ##
  def run_curl(args)
    return nil if @curl.nil?

    IO.popen([@curl, *args], err: File::NULL, &:read)
  rescue SystemCallError
    nil
  end

  ##
  ## Split raw `curl -i` output into status code, headers, and JSON body
  ##
  ## @param      source           [String] Raw output (headers + body)
  ## @param      url              [String] The requested url
  ## @param      symbolize_names  [Boolean] Parse JSON object keys as symbols
  ##
  ## @return     [Hash] hash of url, code, headers, and parsed json
  ##             (json is nil when the body is not valid JSON)
  ##
  def parse_response(source, url, symbolize_names: false)
    lines = source.strip.split(/\r?\n/)
    code = nil
    headers = {}

    # With -L every hop emits its own status line and header block.
    # Keep only the last one; whatever follows it is the body.
    loop do
      code = lines.shift.to_s[/\d{3}/]
      headers = {}
      while (m = lines.first&.match(/\A([\w-]+): (.*)\z/))
        headers[m[1]] = m[2]
        lines.shift
      end
      lines.shift while lines.first == ''
      break unless lines.first.to_s.match?(%r{\AHTTP/\d})
    end

    body = lines.join("\n").strip.force_encoding('utf-8')
    parsed = begin
      JSON.parse(body.gsub(/[\u{1F600}-\u{1F6FF}]/, ''), symbolize_names: symbolize_names)
    rescue StandardError
      # Best effort: a non-JSON (or badly encoded) body still yields
      # the status code and headers.
      nil
    end

    { url: url, code: code, headers: headers, json: parsed }
  end
end
|
108
|
+
end
|
data/lib/curly/curl.rb
ADDED
data/lib/curly/hash.rb
ADDED
@@ -0,0 +1,200 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Hash helpers
|
4
|
+
class ::Hash
  ##
  ## Extract data using a dot-syntax path
  ##
  ## Each dot-separated segment is a key, optionally followed by
  ## comparisons (`items[price>5&name^=foo]`; `&`/`+` combine with
  ## AND, `,` starts an alternative) and/or a numeric index or range
  ## (`items[0]`, `items[0..2]`, `items[1,2]`).
  ##
  ## @param      path  [String] The path
  ##
  ## @return     Result of path query, or false if a plain key is
  ##             missing
  ##
  ## @raise      [ArgumentError] if a bracketed index is malformed
  ##
  def dot_query(path)
    res = stringify_keys
    out = []

    path.split(/(?<![\d.])\./).each do |segment|
      el = segment[/\[([0-9,.]+)\]/, 1]
      groups, pth = extract_comparisons(segment.sub(/\[([0-9,.]+)\]/, ''))

      if res.is_a?(Array) && !pth.empty?
        # A key applied to a list (e.g. after a filter) is looked up
        # in every element that has it.
        res = res.select { |r| r.is_a?(Hash) }
                 .map(&:stringify_keys)
                 .select { |r| r.key?(pth) }
                 .map { |r| r[pth] }
      else
        found = res.is_a?(Hash) && res.key?(pth)
        return false if el.nil? && groups.empty? && !found

        res = (found ? res[pth] : nil) unless pth.empty?
      end

      out = if groups.empty?
              res
            else
              candidates = res.is_a?(Array) ? res : [res]
              candidates.select do |r|
                r.is_a?(Hash) && groups.any? { |group| evaluate_comp(r, group) }
              end
            end

      out = out[*index_args(el)] if el && out.is_a?(Array)

      # Carry the result into the next segment. Hashes found inside
      # arrays have not been stringified yet, so do that for lookups.
      res = out.is_a?(Hash) ? out.stringify_keys : out
    end

    out
  end

  ##
  ## Evaluate a group of comparisons; all of them must pass
  ##
  ## @param      r     [Hash] hash of source elements
  ## @param      atr   [Array<Array>] list of [key, operator, value]
  ##                   comparisons
  ##
  ## @return     [Boolean] whether the comparison passes or fails
  ##
  def evaluate_comp(r, atr)
    atr.all? do |key_name, op, raw|
      key = [key_name.to_sym, key_name.to_s].find { |k| r.key?(k) }
      next false if key.nil?

      actual = r[key]
      if actual.is_a?(Array)
        actual.any? { |item| compare_text(item, op, raw) }
      elsif raw.to_s.match?(/\A\d+(\.\d+)?\z/) && op.to_s.match?(/^[<>=]{1,2}$/)
        compare_number(actual, op, raw.to_s)
      else
        compare_text(actual, op, raw)
      end
    end
  end

  ##
  ## Test if a hash contains a tag matching filter queries
  ##
  ## @param      tag_name    [String] The tag name
  ## @param      classes     [Array<String>, String] The classes to match
  ## @param      id          [String] The id attribute to match
  ## @param      attribute   [String] The attribute
  ## @param      operator    [String] The operator, <>= *= $= ^=
  ## @param      value       [String] The value to match
  ## @param      descendant  [Boolean] Check descendant tags
  ##
  ## @return     [Boolean] whether the tag matches
  ##
  def tag_match(tag_name, classes, id, attribute, operator, value, descendant: false)
    tag = self
    keep = true

    keep = false if tag_name && tag['tag'].to_s !~ /^#{tag_name}$/i

    if tag.key?('attrs') && tag['attrs']
      attrs = tag['attrs']

      if keep && id
        id_attr = attrs.find { |a| a['key'] == 'id' }
        tag_id = id_attr && id_attr['value']
        keep = tag_id ? tag_id.to_s.match?(/#{id}/i) : false
      end

      if keep && classes
        cls = attrs.find { |a| a['key'] == 'class' }
        keep = if cls && cls['value']
                 Array(classes).all? { |c| cls['value'].include?(c) }
               else
                 false
               end
      end

      if keep && attribute
        keep = attrs.select { |a| a['key'].to_s.match?(/^#{attribute}$/i) }
                    .any? { |a| compare_text(a['value'], operator, value) }
      end
    end

    return false if descendant && !keep

    # NOTE(review): when child tags are present the result depends on
    # the children only, not on this tag's own match. Kept as in the
    # original; confirm the intended semantics with callers.
    if !descendant && tag.key?('tags')
      tag['tags'].any? { |t| t.tag_match(tag_name, classes, id, attribute, operator, value) }
    else
      keep
    end
  end

  # Turn all keys into string
  #
  # If the hash has both a string and a symbol for key,
  # keep the string value, discarding the symbol value
  #
  # @return     [Hash] a copy of the hash where all its
  #             keys are strings
  #
  def stringify_keys
    each_with_object({}) do |(k, v), hsh|
      next if k.is_a?(Symbol) && key?(k.to_s)

      hsh[k.to_s] = v.is_a?(Hash) ? v.stringify_keys : v
    end
  end

  private

  # Pull the bracketed comparisons out of a path segment.
  #
  # @param      segment  [String] path segment without its numeric index
  #
  # @return     [Array] two elements: the comparison groups (each a list
  #             of [key, operator, value] triples) and the bare key
  #
  def extract_comparisons(segment)
    pth = segment.dup
    groups = []
    current = []

    while pth =~ /\[[+&,]?\w+ *[\^*$=<>]=? *\w+/
      m = pth.match(/\[(?<com>[,+&])? *(?<key>\w+) *(?<op>[\^*$=<>]{1,2}) *(?<val>\w+)/)
      comp = [m['key'], m['op'], m['val']]

      if m['com'] == ','
        # A comma closes the current AND-group and starts a new one
        groups.push(current) unless current.empty?
        current = [comp]
      else
        current.push(comp)
      end

      pth = pth.sub(/\[(?<com>[,&+])? *(?<key>\w+) *(?<op>[\^*$=<>]{1,2}) *(?<val>\w+)/, '[')
    end

    groups.push(current) unless current.empty?
    [groups, pth.sub(/\[\]/, '')]
  end

  # Convert the text inside a numeric bracket into arguments for
  # Array#[] without evaluating it as Ruby code.
  #
  # @param      spec  [String] "2", "0..3", "0...3", "1.." or "1,2"
  #
  # @return     [Array] arguments to splat into Array#[]
  #
  def index_args(spec)
    case spec
    when /\A\d+\z/
      [spec.to_i]
    when /\A(\d*)(\.{2,3})(\d*)\z/
      m = Regexp.last_match
      first = m[1].empty? ? nil : m[1].to_i
      last = m[3].empty? ? nil : m[3].to_i
      [Range.new(first, last, m[2].length == 3)]
    when /\A(\d+),(\d+)\z/
      m = Regexp.last_match
      [m[1].to_i, m[2].to_i]
    else
      raise ArgumentError, "Invalid index: [#{spec}]"
    end
  end

  # Case-insensitive text comparison: ^= starts with, $= ends with,
  # *= contains, anything else is an exact match.
  #
  # @param      actual    The value found in the data
  # @param      operator  [String] The comparison operator
  # @param      expected  [String] The text to look for
  #
  # @return     [Boolean] whether the value matches
  #
  def compare_text(actual, operator, expected)
    return false if actual.nil?

    pattern = case operator
              when /^\^/ then /^#{expected}/i
              when /^\$/ then /#{expected}$/i
              when /^\*/ then /#{expected}/i
              else /^#{expected}$/i
              end

    actual.to_s.match?(pattern)
  end

  # Numeric comparison using a fixed set of operators (no eval).
  #
  # @param      actual    The value found in the data
  # @param      operator  [String] =, ==, <, >, <= or >=
  # @param      expected  [String] Integer or decimal literal
  #
  # @return     [Boolean] comparison result; false for other operators
  #
  def compare_number(actual, operator, expected)
    right = expected.include?('.') ? expected.to_f : expected.to_i
    left = if actual.is_a?(Numeric)
             actual
           elsif actual.respond_to?(:to_f)
             right.is_a?(Float) ? actual.to_f : actual.to_i
           else
             return false
           end

    case operator
    when '=', '==' then left == right
    when '<' then left < right
    when '>' then left > right
    when '<=' then left <= right
    when '>=' then left >= right
    else false
    end
  end
end
|
data/lib/curly/string.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
##
## String helpers
##
class ::String
  ##
  ## Remove extra spaces and newlines from a string
  ##
  ## @return     [String] cleaned string
  ##
  def clean
    gsub(/[\n ]+/m, ' ').gsub(/> +</, '><')
  end

  ##
  ## Remove HTML tags from a string
  ##
  ## @return     [String] stripped string
  ##
  def strip_tags
    gsub(%r{</?.*?>}, '')
  end

  ##
  ## Destructive version of #clean
  ##
  ## @see #clean
  ##
  def clean!
    replace clean
  end

  ##
  ## Destructive version of #strip_tags
  ##
  ## @see #strip_tags
  ##
  def strip_tags!
    replace strip_tags
  end

  ##
  ## Convert an image type string to a symbol
  ##
  ## @param      default  [Symbol, String] value to use when the string
  ##                      is not a recognized type
  ##
  ## @return     Symbol :srcset, :img, :opengraph, :all
  ##
  def normalize_image_type(default = :all)
    case to_s
    when /^[sp]/i
      :srcset
    when /^i/i
      :img
    when /^o/i
      :opengraph
    else
      default.is_a?(Symbol) ? default : default.to_s.normalize_image_type
    end
  end

  ##
  ## Convert a browser type string to a symbol
  ##
  ## @param      default  [Symbol, String] value to use when the string
  ##                      is not a recognized type
  ##
  ## @return     Symbol :chrome, :firefox
  ##
  def normalize_browser_type(default = :none)
    case to_s
    when /^c/i
      :chrome
    when /^f/i
      :firefox
    else
      default.is_a?(Symbol) ? default : default.to_s.normalize_browser_type
    end
  end

  ##
  ## Convert a screenshot type string to a symbol
  ##
  ## @param      default  [Symbol, String] value to use when the string
  ##                      is not a recognized type
  ##
  ## @return     Symbol :full_page, :print_page, :visible
  ##
  def normalize_screenshot_type(default = :none)
    case to_s
    when /^f/i
      :full_page
    when /^p/i
      :print_page
    when /^v/i
      :visible
    else
      # A string default is normalized as a screenshot type, not as a
      # browser type.
      default.is_a?(Symbol) ? default : default.to_s.normalize_screenshot_type
    end
  end
end
|
data/lib/curly.rb
ADDED
data/src/_README.md
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
<!--README--><!--GITHUB--># curlyq
|
2
|
+
|
3
|
+
[![Gem](https://img.shields.io/gem/v/curlyq.svg)](https://rubygems.org/gems/curlyq)
|
4
|
+
[![GitHub license](https://img.shields.io/github/license/ttscoff/curlyq.svg)](./LICENSE.txt)
|
5
|
+
|
6
|
+
**A command line helper for curl and web scraping**
|
7
|
+
|
8
|
+
_If you find this useful, feel free to [buy me some coffee][donate]._
|
9
|
+
<!--END GITHUB-->
|
10
|
+
|
11
|
+
The current version of `curlyq` is <!--VER--><!--END VER-->.
|
12
|
+
|
13
|
+
`curlyq` is a command that provides a simple interface for curl, with additional features for things like extracting images and links, finding elements by CSS selector or XPath, getting detailed header info, and more. It also has rudimentary support for making calls to JSON endpoints easier, but it's expected that you'll use something like `jq` to parse the output.
|
14
|
+
|
15
|
+
[github]: https://github.com/ttscoff/curlyq/
|
16
|
+
|
17
|
+
### Installation
|
18
|
+
|
19
|
+
Assuming you have Ruby and RubyGems installed, you can just run `gem install curlyq`. If you run into errors, try `gem install --user-install curlyq`, or use `sudo gem install curlyq`.
|
20
|
+
|
21
|
+
If you're using Homebrew, you have the option to install via [brew-gem](https://github.com/sportngin/brew-gem):
|
22
|
+
|
23
|
+
brew install brew-gem
|
24
|
+
brew gem install curlyq
|
25
|
+
|
26
|
+
If you don't have Ruby/RubyGems, you can install them pretty easily with Homebrew, rvm, or asdf.
|
27
|
+
|
28
|
+
### Usage
|
29
|
+
|
30
|
+
Run `curlyq help` for a list of commands. Run `curlyq help SUBCOMMAND` for details on a particular subcommand and its options.
|
31
|
+
|
32
|
+
```
|
33
|
+
@cli(bundle exec bin/curlyq help)
|
34
|
+
```
|
35
|
+
|
36
|
+
#### Commands
|
37
|
+
|
38
|
+
curlyq makes use of subcommands, e.g. `curlyq html` or `curlyq extract`. Each subcommand takes its own options, but I've made an effort to standardize the choices between each command.
|
39
|
+
|
40
|
+
##### extract
|
41
|
+
|
42
|
+
```
|
43
|
+
@cli(bundle exec bin/curlyq help extract)
|
44
|
+
```
|
45
|
+
|
46
|
+
|
47
|
+
##### headlinks
|
48
|
+
|
49
|
+
```
|
50
|
+
@cli(bundle exec bin/curlyq help headlinks)
|
51
|
+
```
|
52
|
+
|
53
|
+
##### html
|
54
|
+
|
55
|
+
```
|
56
|
+
@cli(bundle exec bin/curlyq help html)
|
57
|
+
```
|
58
|
+
|
59
|
+
##### images
|
60
|
+
|
61
|
+
```
|
62
|
+
@cli(bundle exec bin/curlyq help images)
|
63
|
+
```
|
64
|
+
|
65
|
+
##### json
|
66
|
+
|
67
|
+
```
|
68
|
+
@cli(bundle exec bin/curlyq help json)
|
69
|
+
```
|
70
|
+
|
71
|
+
##### links
|
72
|
+
|
73
|
+
```
|
74
|
+
@cli(bundle exec bin/curlyq help links)
|
75
|
+
```
|
76
|
+
|
77
|
+
##### scrape
|
78
|
+
|
79
|
+
```
|
80
|
+
@cli(bundle exec bin/curlyq help scrape)
|
81
|
+
```
|
82
|
+
|
83
|
+
##### screenshot
|
84
|
+
|
85
|
+
```
|
86
|
+
@cli(bundle exec bin/curlyq help screenshot)
|
87
|
+
```
|
88
|
+
|
89
|
+
##### tags
|
90
|
+
|
91
|
+
```
|
92
|
+
@cli(bundle exec bin/curlyq help tags)
|
93
|
+
```
|
94
|
+
|
95
|
+
<!--GITHUB-->
|
96
|
+
PayPal link: [paypal.me/ttscoff](https://paypal.me/ttscoff)
|
97
|
+
|
98
|
+
## Changelog
|
99
|
+
|
100
|
+
See [CHANGELOG.md](https://github.com/ttscoff/curlyq/blob/master/CHANGELOG.md)
|
101
|
+
<!--END GITHUB--><!--END README-->
|
data/test/test_helper.rb
ADDED