sawzall 0.1.0.pre2-aarch64-linux → 0.1.0.pre3-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +18 -15
- data/lib/sawzall/3.2/sawzall.so +0 -0
- data/lib/sawzall/3.4/sawzall.so +0 -0
- data/lib/sawzall/version.rb +1 -1
- data/lib/sawzall.rb +182 -108
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f1085b4be4dd7a81804eff99959ba07f8acff313264488aad68b79fd0b6dd04c
|
4
|
+
data.tar.gz: a5aecb0e25217d34ae57d6e455ce128f12e56c608e26a9a1ad79366e277c5c7d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3307ba3347839cb16d5671866657023b1f43958a3eaf825e31a57b4a7f9af67ce005b9df0adeff96b5afdf8567ecfab0d2bbf201b9486efc59f5ecc7a942442f
|
7
|
+
data.tar.gz: da0b844b2480850403e72e1b0e06dda039fbe807b811b22134f44b5ac5cf6cc6b2091ba73f3ff8ae18f15dc6b5c92c2a37060ed648ec4910bc87f4b08d95ff08
|
data/README.md
CHANGED
@@ -1,38 +1,41 @@
|
|
1
|
-
# Sawzall
|
1
|
+
# Sawzall 🪚
|
2
2
|
|
3
|
-
|
3
|
+
Sawzall wraps the Rust scraper library (https://github.com/rust-scraper/scraper) to make it easy to parse HTML documents and query them with CSS selectors.
|
4
4
|
|
5
|
-
|
5
|
+
```ruby
|
6
|
+
require "sawzall"
|
7
|
+
require "net/http"
|
6
8
|
|
7
|
-
|
9
|
+
doc = Sawzall.parse_document(Net::HTTP.get("example.org", "/"))
|
10
|
+
doc.select("title").first.text #=> "Example Domain"
|
11
|
+
```
|
8
12
|
|
9
|
-
|
13
|
+
> [!NOTE]
|
14
|
+
> Sawzall is a hobby project. Expect ongoing development and maintenance to be very much correlated to how much value it brings me as a learning resource and as a tool for my other projects.
|
15
|
+
>
|
16
|
+
> You are welcome to report bugs you run into or submit pull requests for changes that would make it more useful for your use-case, but please bear the above in mind.
|
17
|
+
|
18
|
+
## Installation
|
10
19
|
|
11
20
|
Install the gem and add to the application's Gemfile by executing:
|
12
21
|
|
13
22
|
```bash
|
14
|
-
bundle add
|
23
|
+
bundle add sawzall
|
15
24
|
```
|
16
25
|
|
17
26
|
If bundler is not being used to manage dependencies, install the gem by executing:
|
18
27
|
|
19
28
|
```bash
|
20
|
-
gem install
|
29
|
+
gem install sawzall
|
21
30
|
```
|
22
31
|
|
23
32
|
## Usage
|
24
33
|
|
25
|
-
|
26
|
-
|
27
|
-
## Development
|
28
|
-
|
29
|
-
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
30
|
-
|
31
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
34
|
+
[API documentation](https://davidcornu.github.io/sawzall/)
|
32
35
|
|
33
36
|
## Contributing
|
34
37
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
38
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/davidcornu/sawzall.
|
36
39
|
|
37
40
|
## License
|
38
41
|
|
data/lib/sawzall/3.2/sawzall.so
CHANGED
Binary file
|
data/lib/sawzall/3.4/sawzall.so
CHANGED
Binary file
|
data/lib/sawzall/version.rb
CHANGED
data/lib/sawzall.rb
CHANGED
@@ -73,112 +73,186 @@ module Sawzall
|
|
73
73
|
# # @return [Sawzall::Element]
|
74
74
|
# end
|
75
75
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
76
|
+
class Element
|
77
|
+
# @!group 1) Querying
|
78
|
+
|
79
|
+
# Returns the element's name in lowercase
|
80
|
+
#
|
81
|
+
# @example
|
82
|
+
# doc = Sawzall.parse_fragment("<p>Paragraph</p>")
|
83
|
+
# doc.select("p").first.name #=> "p"
|
84
|
+
#
|
85
|
+
# @!method name
|
86
|
+
# @return [String]
|
87
|
+
|
88
|
+
# Returns the element's outer HTML
|
89
|
+
#
|
90
|
+
# @example
|
91
|
+
# doc = Sawzall.parse_fragment(<<~HTML)
|
92
|
+
# <section>
|
93
|
+
# <h1>Heading</h1>
|
94
|
+
# </section>
|
95
|
+
# HTML
|
96
|
+
# section = doc.select("section").first
|
97
|
+
# section.html #=> "<section>\n<h1>Heading</h1>\n</section>"
|
98
|
+
#
|
99
|
+
# @!method html
|
100
|
+
# @return [String]
|
101
|
+
|
102
|
+
# Returns the element's inner HTML
|
103
|
+
#
|
104
|
+
# @example
|
105
|
+
# doc = Sawzall.parse_fragment(<<~HTML)
|
106
|
+
# <section>
|
107
|
+
# <h1>Heading</h1>
|
108
|
+
# </section>
|
109
|
+
# HTML
|
110
|
+
# section = doc.select("section").first
|
111
|
+
# section.inner_html #=> "\n<h1>Heading</h1>\n"
|
112
|
+
#
|
113
|
+
# @!method inner_html
|
114
|
+
# @return [String]
|
115
|
+
|
116
|
+
# Returns the given attribute's value or `nil`
|
117
|
+
#
|
118
|
+
# @example
|
119
|
+
# doc = Sawzall.parse_fragment("<h1 id='title'>Heading</h1>")
|
120
|
+
# h1 = doc.select("h1").first
|
121
|
+
# h1.attr("id") #=> "title"
|
122
|
+
# h1.attr("class") #=> nil
|
123
|
+
#
|
124
|
+
# @!method attr(attribute)
|
125
|
+
# @param attribute [String]
|
126
|
+
# @return [String, Nil]
|
127
|
+
|
128
|
+
# Returns the element's attributes as an array of key-value pairs
|
129
|
+
#
|
130
|
+
# @example
|
131
|
+
# doc = Sawzall.parse_fragment("<h1 id='title' class='big'>Heading</h1>")
|
132
|
+
# h1 = doc.select("h1").first
|
133
|
+
# h1.attrs #=> [["class", "big"], ["id", "title"]]
|
134
|
+
#
|
135
|
+
# @!method attrs
|
136
|
+
# @return [Array<Array(String, String)>]
|
137
|
+
|
138
|
+
# Returns the child elements that match the given CSS selector
|
139
|
+
#
|
140
|
+
# https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_selectors
|
141
|
+
#
|
142
|
+
# @example
|
143
|
+
# doc = Sawzall.parse_fragment(<<~HTML)
|
144
|
+
# <div class="container">
|
145
|
+
# <div>inner div 1</div>
|
146
|
+
# <div>inner div 2</div>
|
147
|
+
# </div>
|
148
|
+
# HTML
|
149
|
+
# container = doc.select(".container").first
|
150
|
+
# matches = container.select("div")
|
151
|
+
# matches.map(&:text) #=> ["inner div 1", "inner div 2"]
|
152
|
+
#
|
153
|
+
# @!method select(css_selector)
|
154
|
+
# @param css_selector [String]
|
155
|
+
# @raise [ArgumentError] if the CSS selector is invalid
|
156
|
+
# @return [Array<Sawzall::Element>]
|
157
|
+
|
158
|
+
# Returns the element's child elements
|
159
|
+
#
|
160
|
+
# @example
|
161
|
+
# doc = Sawzall.parse_fragment(<<~HTML)
|
162
|
+
# <div id="parent">
|
163
|
+
# <div id="child1">
|
164
|
+
# <div id="grandchild1"></div>
|
165
|
+
# </div>
|
166
|
+
# <div id="child2"></div>
|
167
|
+
# </div>
|
168
|
+
# HTML
|
169
|
+
# parent = doc.select("#parent").first
|
170
|
+
# parent
|
171
|
+
# .child_elements
|
172
|
+
# .map { it.attr("id") } #=> ["child1", "child2"]
|
173
|
+
#
|
174
|
+
# @!method child_elements
|
175
|
+
# @return [Array<Sawzall::Element>]
|
176
|
+
|
177
|
+
# Returns the element's text content using a very simplified version of the
|
178
|
+
# `innerText` algorithm.
|
179
|
+
#
|
180
|
+
# https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText
|
181
|
+
#
|
182
|
+
# @example
|
183
|
+
# doc = Sawzall.parse_fragment(<<~HTML)
|
184
|
+
# <ul>
|
185
|
+
# <li>First item</li>
|
186
|
+
# <li>Second item</li>
|
187
|
+
# </ul>
|
188
|
+
# HTML
|
189
|
+
# ul = doc.select("ul").first
|
190
|
+
# ul.text #=> "First item\nSecond item"
|
191
|
+
#
|
192
|
+
# @!method text
|
193
|
+
# @return [String]
|
194
|
+
|
195
|
+
# Checks whether the element has the given class
|
196
|
+
#
|
197
|
+
# @example
|
198
|
+
# doc = Sawzall.parse_fragment("<h1 class='title'>Heading</h1>")
|
199
|
+
# h1 = doc.select("h1").first
|
200
|
+
# h1.has_class?("title") #=> true
|
201
|
+
# h1.has_class?("TITLE", case_sensitive: false) #=> true
|
202
|
+
# h1.has_class?("heading") #=> false
|
203
|
+
#
|
204
|
+
# @!method has_class?(css_class, case_sensitive: true)
|
205
|
+
# @param css_class [String]
|
206
|
+
# @param case_sensitive [Boolean]
|
207
|
+
# Whether matching should be case sensitive. When `false`, only ASCII characters are matched case-insensitively.
|
208
|
+
# @return [Boolean]
|
209
|
+
|
210
|
+
# Returns the element's classes
|
211
|
+
#
|
212
|
+
# @example
|
213
|
+
# doc = Sawzall.parse_fragment("<h1 class='one two'>Heading</h1>")
|
214
|
+
# h1 = doc.select("h1").first
|
215
|
+
# h1.classes #=> ["one", "two"]
|
216
|
+
#
|
217
|
+
# @!method classes
|
218
|
+
# @return [Array<String>]
|
219
|
+
|
220
|
+
# @!endgroup
|
221
|
+
|
222
|
+
# @!group 2) Debugging
|
223
|
+
|
224
|
+
# Overrides Ruby's default `Object#inspect` so the output is a bit more useful
|
225
|
+
def inspect
|
226
|
+
"<#{self.class.name} name=#{name.inspect} child_elements=#{child_elements.inspect}>"
|
227
|
+
end
|
228
|
+
|
229
|
+
# Provides a custom pretty-printing implementation for Ruby's `PP`
|
230
|
+
def pretty_print(pp)
|
231
|
+
pp.group(2, "#(#{self.class.name} {", "})") do
|
232
|
+
pp.breakable
|
233
|
+
|
234
|
+
fields = [:name]
|
235
|
+
fields << :child_elements unless child_elements.empty?
|
236
|
+
|
237
|
+
pp.seplist(fields) do |field|
|
238
|
+
case field
|
239
|
+
when :name
|
240
|
+
pp.text("name = ")
|
241
|
+
pp.pp(name)
|
242
|
+
when :child_elements
|
243
|
+
pp.group(2, "child_elements = [", "]") do
|
244
|
+
pp.breakable
|
245
|
+
pp.seplist(child_elements) do |child|
|
246
|
+
pp.pp(child)
|
247
|
+
end
|
248
|
+
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
|
252
|
+
pp.breakable
|
253
|
+
end
|
254
|
+
end
|
255
|
+
|
256
|
+
# @!endgroup
|
257
|
+
end
|
184
258
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sawzall
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0.pre2
|
4
|
+
version: 0.1.0.pre3
|
5
5
|
platform: aarch64-linux
|
6
6
|
authors:
|
7
7
|
- David Cornu
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-05-
|
11
|
+
date: 2025-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |
|
14
14
|
Sawzall wraps the Rust scraper library (https://github.com/rust-scraper/scraper)
|