scrapetor 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +242 -0
- data/LICENSE +21 -0
- data/README.md +440 -0
- data/bin/scrapetor +190 -0
- data/bin/scrapetor-bench +5 -0
- data/ext/scrapetor/README.md +53 -0
- data/ext/scrapetor/native/extconf.rb +67 -0
- data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
- data/ext/scrapetor/native/scrapetor_http.c +2591 -0
- data/ext/scrapetor/native/scrapetor_native.c +1156 -0
- data/lib/scrapetor/builder.rb +158 -0
- data/lib/scrapetor/cleaner.rb +10 -0
- data/lib/scrapetor/comment_node.rb +67 -0
- data/lib/scrapetor/document.rb +457 -0
- data/lib/scrapetor/dom/parser.rb +69 -0
- data/lib/scrapetor/dom/selectors.rb +208 -0
- data/lib/scrapetor/dom.rb +563 -0
- data/lib/scrapetor/encoding.rb +85 -0
- data/lib/scrapetor/entities.rb +90 -0
- data/lib/scrapetor/errors.rb +12 -0
- data/lib/scrapetor/extractor.rb +147 -0
- data/lib/scrapetor/fetcher.rb +390 -0
- data/lib/scrapetor/fingerprint.rb +29 -0
- data/lib/scrapetor/form.rb +141 -0
- data/lib/scrapetor/http.rb +114 -0
- data/lib/scrapetor/microdata.rb +132 -0
- data/lib/scrapetor/money.rb +30 -0
- data/lib/scrapetor/native.rb +291 -0
- data/lib/scrapetor/native_dom.rb +2258 -0
- data/lib/scrapetor/node.rb +539 -0
- data/lib/scrapetor/node_set.rb +301 -0
- data/lib/scrapetor/page_type.rb +95 -0
- data/lib/scrapetor/pagination.rb +109 -0
- data/lib/scrapetor/persistent_cache.rb +130 -0
- data/lib/scrapetor/robots.rb +159 -0
- data/lib/scrapetor/sax.rb +285 -0
- data/lib/scrapetor/schema.rb +144 -0
- data/lib/scrapetor/selector.rb +576 -0
- data/lib/scrapetor/session.rb +141 -0
- data/lib/scrapetor/sitemap.rb +52 -0
- data/lib/scrapetor/stream.rb +111 -0
- data/lib/scrapetor/structured_data.rb +74 -0
- data/lib/scrapetor/template_registry.rb +24 -0
- data/lib/scrapetor/text_node.rb +101 -0
- data/lib/scrapetor/url.rb +21 -0
- data/lib/scrapetor/version.rb +5 -0
- data/lib/scrapetor/xpath.rb +1603 -0
- data/lib/scrapetor.rb +167 -0
- data/scrapetor.gemspec +77 -0
- metadata +200 -0
|
@@ -0,0 +1,539 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "date"
|
|
4
|
+
|
|
5
|
+
module Scrapetor
  # Featherweight node wrapper. Holds a document reference and a backing
  # Nokolexbor element. Selector ops delegate to the backing engine in
  # Phase 1; the native extension (Phase 2) replaces this with arena-DOM
  # + bytecode VM.
  #
  # Most methods are thin delegators to the backing node (`@nlx`); results
  # that are backing nodes are re-wrapped in Node / NodeSet so callers stay
  # inside the Scrapetor API.
  class Node
    # @param doc     the owning document wrapper (used for base_url, css, root)
    # @param backing the engine-level node this wrapper delegates to
    def initialize(doc, backing)
      @doc = doc
      @nlx = backing
    end

    # ----- Text / serialization -----

    # Raw text content as reported by the backing node.
    def text
      @nlx.text
    end
    # Nokogiri-compat: `content` and `inner_text` are aliases for `text`.
    alias content text
    alias inner_text text

    # Text content passed through Cleaner.clean.
    def clean_text
      Cleaner.clean(text)
    end

    # Cleaned text of a copy of this node with script/style/noscript
    # elements removed (when the copy supports `css`).
    def visible_text
      stripped = @nlx.dup
      stripped.css("script, style, noscript").each(&:remove) if stripped.respond_to?(:css)
      Cleaner.clean(stripped.text)
    end

    def inner_html
      @nlx.inner_html
    end

    # Serialized markup of this node. Any arguments are forwarded to the
    # backing node's `to_html`, so `to_html(*args)`, `serialize(*args)` and
    # `write_to(io, *args)` no longer raise ArgumentError when given
    # serialization arguments.
    def outer_html(*args)
      @nlx.to_html(*args)
    end
    alias to_html outer_html
    alias to_str to_html

    def to_xml(*args)
      @nlx.to_html(*args)
    end

    # Writes the serialized node to +io+ (anything responding to `write`).
    def write_to(io, *args)
      io.write(to_html(*args))
    end

    def serialize(*args)
      to_html(*args)
    end

    # ----- Identity / type -----

    def name
      @nlx.name
    end
    alias node_name name
    alias tag_name name

    # Delegates to the backing node when it answers `element?`; otherwise
    # the wrapper assumes it holds an element.
    def element?
      @nlx.respond_to?(:element?) ? @nlx.element? : true
    end

    def document?; false; end

    def fragment?
      false
    end

    # Backing node's node_type, defaulting to 1 when the backing node
    # does not expose one.
    def node_type
      @nlx.respond_to?(:node_type) ? @nlx.node_type : 1
    end
    alias type node_type

    def comment?
      node_type == 8
    end

    def text?
      node_type == 3
    end
    alias text_node? text?

    def cdata?
      node_type == 4
    end

    def processing_instruction?
      node_type == 7
    end

    # True when the text content is empty or whitespace-only.
    def blank?
      text.to_s.strip.empty?
    end

    def document
      @doc
    end

    def root
      @doc.root
    end

    def backing_node
      @nlx
    end

    def fingerprint
      Fingerprint.structural(self)
    end

    # Two wrappers are equal when they wrap equal backing nodes.
    def ==(other)
      other.is_a?(Node) && @nlx == other.backing_node
    end
    alias eql? ==

    # Hash follows the backing node so equal wrappers collide as Hash keys.
    def hash
      @nlx.hash
    end

    # ----- Attribute access -----

    # Nokogiri-compat: return all element attributes as a Hash.
    def attributes
      @nlx.attribute_nodes.each_with_object({}) { |a, h| h[a.name] = a.value }
    end

    def keys
      @nlx.attribute_nodes.map(&:name)
    end

    def values
      @nlx.attribute_nodes.map(&:value)
    end

    def has_attribute?(name)
      !@nlx[name.to_s].nil?
    end
    alias key? has_attribute?
    alias attribute? has_attribute?

    # Iterate over attributes as Nokogiri does; yields [name, value] pairs.
    # Returns an Enumerator when no block is given.
    def each_attribute
      return enum_for(:each_attribute) unless block_given?
      @nlx.attribute_nodes.each { |a| yield [a.name, a.value] }
    end

    def attr(key)
      @nlx[key.to_s]
    end

    def [](key)
      @nlx[key.to_s]
    end

    def get_attribute(key)
      @nlx[key.to_s]
    end

    def attribute_nodes
      @nlx.attribute_nodes
    end

    # The backing attribute node named +name+, or nil.
    def attribute(name)
      wanted = name.to_s
      @nlx.attribute_nodes.find { |a| a.name == wanted }
    end

    # ----- Typed value helpers -----

    # Resolves this node's `href` (or, failing that, `src`) against +base+,
    # defaulting to the document's base_url.
    def absolute_url(base = nil)
      href = @nlx["href"] || @nlx["src"]
      URL.absolute(href, base || @doc.base_url)
    end

    def money
      Money.parse(text)
    end

    # Numeric value of the text with everything but digits, "." and "-"
    # stripped. Returns a Float when a "." remains, an Integer otherwise,
    # and nil when the text contains no digit at all (previously text such
    # as "N.A." or "--" produced 0.0 / 0 instead of nil).
    def number
      v = text.to_s.gsub(/[^\d.\-]/, "")
      return nil unless v.match?(/\d/)
      v.include?(".") ? v.to_f : v.to_i
    end

    # Date parsed from the text, or nil when it cannot be parsed.
    def date
      Date.parse(text.to_s)
    rescue ArgumentError, TypeError
      nil
    end

    # ----- Selection -----

    # Nokogiri-compat: `node.css(selector, ns_or_handler)`. Extra args
    # are XPath-only and harmless to ignore for CSS.
    #
    # `::text` / `::attr(name)` queries hand back a flat Array of
    # String/TextNode; that is passed through as-is. Everything else,
    # including the zero-match case, is wrapped in a NodeSet so callers can
    # chain. The pseudo-element shape is detected from the selector string
    # because the result shape would mis-classify zero-match queries.
    def css(selector, *_extra)
      result = @nlx.css(selector)
      return result if result.is_a?(Array) && selector_pseudo_element?(selector)
      NodeSet.new(@doc, result.to_a)
    end

    # First match for +selector+: a String is returned untouched, a node is
    # wrapped in Node, no match yields nil/false as given by the backing node.
    def at(selector, *_extra)
      n = @nlx.at_css(selector)
      return n if n.is_a?(String)
      n && Node.new(@doc, n)
    end

    alias at_css at
    alias search css

    # Native C versions of Node#at and Node#css are installed by
    # native_dom.rb after the Native extension module is loaded —
    # they aren't available at this point in the require chain.

    # Batch API: array of selector strings → array of results, delegated to
    # the backing node's batch_css when available; otherwise falls back to
    # one css() call per selector.
    #
    # Array results whose first item is a String are passed through. Empty
    # Arrays are classified by the selector (as css does): pseudo-element
    # selectors keep the bare Array, everything else becomes an empty
    # NodeSet so both code paths return the same shape for zero matches.
    def batch_css(selectors)
      return selectors.map { |s| css(s) } unless @nlx.respond_to?(:batch_css)

      sels = selectors.to_a
      @nlx.batch_css(selectors).each_with_index.map do |r, i|
        next r unless r.is_a?(Array) # NodeSet or other

        if r.first.is_a?(String) || (r.empty? && selector_pseudo_element?(sels[i]))
          r
        else
          NodeSet.new(@doc, r)
        end
      end
    end

    # Hash-form batch: {key => selector} → {key => result}.
    def extract_css(map)
      map.keys.zip(batch_css(map.values)).to_h
    end

    # Per-result extract: routes to the backing node's `extract` when it has
    # one; otherwise builds {key => at_css(selector)} in Ruby.
    def extract(map)
      return @nlx.extract(map) if @nlx.respond_to?(:extract)

      out = {}
      map.each_pair { |k, sel| out[k] = at_css(sel) }
      out
    end

    # extract_each: runs +fields+ against every match of +outer_selector+
    # under this node. Uses the backing node's `extract_each` when present,
    # else css + extract per match.
    def extract_each(outer_selector, fields)
      if @nlx.respond_to?(:extract_each)
        @nlx.extract_each(outer_selector, fields)
      else
        css(outer_selector).map { |n| n.extract(fields) }
      end
    end

    # True when this node is among the document-wide matches of +selector+.
    # Results that are not node wrappers (e.g. Strings from pseudo-element
    # selectors) are ignored rather than raising NoMethodError.
    def matches?(selector)
      @doc.css(selector).to_a.any? do |n|
        n.respond_to?(:backing_node) && n.backing_node == @nlx
      end
    end

    # Evaluate an XPath expression against this node (relative
    # expressions are scoped to it). See Scrapetor::Document#xpath
    # for the supported subset.
    def xpath(expr)
      Scrapetor::XPath.evaluate(self, expr)
    end

    # First item of an Array XPath result; any other result is returned as-is.
    def at_xpath(expr)
      result = xpath(expr)
      result.is_a?(Array) ? result.first : result
    end

    # ----- Tree navigation -----

    # Element children only (text/comment children are filtered out).
    def children
      NodeSet.new(@doc, element_nodes)
    end

    def element_children
      NodeSet.new(@doc, element_nodes)
    end
    alias elements element_children

    # Parent wrapper, or nil when there is no parent or the parent is a
    # Dom::Document.
    def parent
      p = @nlx.parent
      return nil if p.nil? || (defined?(Dom::Document) && p.is_a?(Dom::Document))
      Node.new(@doc, p)
    end

    # Nokogiri-compatible: returns the literal next node (may be a text /
    # comment node). Use `next_element_sibling` (or `next_element`) to skip
    # non-element siblings.
    def next_sibling
      sib = @nlx.next_sibling
      sib && Node.new(@doc, sib)
    end
    alias next next_sibling

    def previous_sibling
      sib = @nlx.previous_sibling
      sib && Node.new(@doc, sib)
    end
    alias prev previous_sibling
    alias previous previous_sibling

    def next_element_sibling
      sib = @nlx.next_sibling
      sib = sib.next_sibling while sib && !element_backing?(sib)
      sib && Node.new(@doc, sib)
    end
    alias next_element next_element_sibling

    def previous_element_sibling
      sib = @nlx.previous_sibling
      sib = sib.previous_sibling while sib && !element_backing?(sib)
      sib && Node.new(@doc, sib)
    end
    alias previous_element previous_element_sibling

    def first_element_child
      c = element_nodes.first
      c && Node.new(@doc, c)
    end

    # Nokogiri-compat: `Node#child` returns the first child regardless
    # of node type (text / element / comment). Used by parsers that
    # poke at the immediate inner content (e.g. heading nodes whose
    # text lives in a text-node child).
    def child
      c = @nlx.children.to_a.first
      c && Node.new(@doc, c)
    end

    def last_element_child
      c = element_nodes.last
      c && Node.new(@doc, c)
    end

    # Yields this node, then recurses depth-first through element children.
    # Returns an Enumerator when no block is given.
    def traverse(&block)
      return enum_for(:traverse) unless block_given?
      yield self
      element_children.each { |c| c.traverse(&block) }
    end

    # Ancestors from nearest to farthest as a NodeSet, optionally narrowed
    # to those for which matches?(selector) holds.
    def ancestors(selector = nil)
      chain = []
      cur = parent
      while cur
        chain << cur.backing_node
        cur = cur.parent
      end
      result = NodeSet.new(@doc, chain)
      selector.nil? ? result : result.select { |n| n.matches?(selector) }
    end

    # Backing node's path, or nil when it does not expose one.
    def path
      @nlx.path if @nlx.respond_to?(:path)
    end

    # XPath path to this node.
    def xpath_path
      path
    end

    # Build a minimal CSS path back to this node (id-based when
    # available, falling back to tag + :nth-of-type indexing).
    #
    # The walk stops at the first ancestor carrying a non-empty id, or at
    # the first ancestor that is not an element. Ids that are not plain CSS
    # identifiers (e.g. "a.b", "1x") are emitted as `[id="…"]`, because
    # `#a.b` would be read as id "a" with class "b".
    def css_path
      parts = []
      cur = @nlx
      while cur && cur.respond_to?(:name) && element_backing?(cur)
        id = cur["id"]
        if id && !id.empty?
          parts.unshift(id_selector(id))
          break
        end
        index = 1
        sib = cur.previous_sibling
        while sib
          index += 1 if element_backing?(sib) && sib.name == cur.name
          sib = sib.previous_sibling
        end
        parts.unshift("#{cur.name}:nth-of-type(#{index})")
        cur = cur.parent
      end
      parts.join(" > ")
    end

    # ----- Mutation API (delegated to Nokolexbor) -----

    # Sets attribute +key+; nil is passed through, anything else is
    # stringified. Returns +value+.
    def []=(key, value)
      @nlx[key.to_s] = value.nil? ? nil : value.to_s
      value
    end
    alias set_attribute []=

    def remove_attribute(key)
      @nlx.remove_attribute(key.to_s)
      self
    end
    alias delete_attribute remove_attribute

    def content=(text)
      @nlx.content = text.to_s
      text
    end

    def inner_html=(html)
      @nlx.inner_html = html.to_s
      html
    end

    def add_child(node_or_html)
      wrap_result(@nlx.add_child(unwrap_mut(node_or_html)))
    end
    alias << add_child
    alias add_child! add_child

    def add_previous_sibling(node_or_html)
      wrap_result(@nlx.add_previous_sibling(unwrap_mut(node_or_html)))
    end
    alias before add_previous_sibling

    def add_next_sibling(node_or_html)
      wrap_result(@nlx.add_next_sibling(unwrap_mut(node_or_html)))
    end
    alias after add_next_sibling

    def replace(node_or_html)
      wrap_result(@nlx.replace(unwrap_mut(node_or_html)))
    end
    alias replace_with replace
    alias swap replace

    def remove
      @nlx.remove
      self
    end
    alias unlink remove
    alias delete remove

    # Wraps this node via the backing node's `wrap` when it has one (a
    # no-op otherwise). A Scrapetor::Node argument is unwrapped to its
    # backing node first, like the other mutation methods. Returns self.
    def wrap(html_or_node)
      @nlx.wrap(unwrap_mut(html_or_node)) if @nlx.respond_to?(:wrap)
      self
    end

    # ----- Class manipulation -----

    def add_class(klass)
      @nlx.add_class(klass.to_s)
      self
    end
    alias append_class add_class

    # Removes +klass+, or the whole `class` attribute when called without
    # an argument. Returns self.
    def remove_class(klass = nil)
      if klass.nil?
        @nlx.remove_attribute("class")
      else
        @nlx.remove_class(klass.to_s)
      end
      self
    end

    # Whitespace-separated tokens of the `class` attribute ([] when absent).
    def classes
      (@nlx["class"] || "").split(/\s+/).reject(&:empty?)
    end

    def has_class?(klass)
      classes.include?(klass.to_s)
    end

    private

    # Truthy when +sel+ ends in a pseudo-element (`::text`, `::attr(x)`,
    # `::first-letter`, `::first-line`, `::before`, `::after`).
    def selector_pseudo_element?(sel)
      s = sel.to_s
      s.include?("::") &&
        s.match?(/::(?:text|attr\([^)]+\)|first-letter|first-line|before|after)\s*\z/i)
    end

    # True when +node+ (a backing node) reports itself as an element.
    def element_backing?(node)
      node.respond_to?(:element?) && node.element?
    end

    # Backing children of this node that are elements, in document order.
    def element_nodes
      @nlx.children.to_a.select { |c| element_backing?(c) }
    end

    # CSS selector for an id: `#id` for plain identifiers, otherwise an
    # attribute selector with `"` and `\` backslash-escaped.
    def id_selector(id)
      return "##{id}" if id.match?(/\A-?[A-Za-z_][\w-]*\z/)

      escaped = id.gsub(/["\\]/) { |ch| "\\#{ch}" }
      %([id="#{escaped}"])
    end

    # Normalizes a mutation result: nil stays nil, Node is returned as-is,
    # Array becomes a NodeSet, a backing element becomes a Node, anything
    # else is returned untouched. Best-effort: on any StandardError the raw
    # result is returned.
    def wrap_result(result)
      return nil if result.nil?

      case result
      when Node then result
      when Array then NodeSet.new(@doc, result)
      else
        element_backing?(result) ? Node.new(@doc, result) : result
      end
    rescue StandardError
      result
    end

    # Backing node for a Node argument; any other argument unchanged.
    def unwrap_mut(node_or_html)
      node_or_html.is_a?(Node) ? node_or_html.backing_node : node_or_html
    end
  end
end
|