trafilatura 0.3.7-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +190 -0
- data/README.md +173 -0
- data/lib/trafilatura/convenience.rb +60 -0
- data/lib/trafilatura/libtrafilatura_uniffi.so +0 -0
- data/lib/trafilatura/trafilatura_generated.rb +1325 -0
- data/lib/trafilatura/version.rb +5 -0
- data/lib/trafilatura.rb +5 -0
- metadata +67 -0
|
@@ -0,0 +1,1325 @@
|
|
|
1
|
+
# This file was autogenerated by some hot garbage in the `uniffi` crate.
|
|
2
|
+
# Trust me, you don't want to mess with it!
|
|
3
|
+
|
|
4
|
+
# Common helper code.
|
|
5
|
+
#
|
|
6
|
+
# Ideally this would live in a separate .rb file where it can be unittested etc
|
|
7
|
+
# in isolation, and perhaps even published as a re-useable package.
|
|
8
|
+
#
|
|
9
|
+
# However, it's important that the details of how this helper code works (e.g. the
|
|
10
|
+
# way that different builtin types are passed across the FFI) exactly match what's
|
|
11
|
+
# expected by the rust code on the other side of the interface. In practice right
|
|
12
|
+
# now that means coming from the exact same version of `uniffi` that was used to
|
|
13
|
+
# compile the rust component. The easiest way to ensure this is to bundle the Ruby
|
|
14
|
+
# helpers directly inline like we're doing here.
|
|
15
|
+
|
|
16
|
+
require 'ffi'
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
module Trafilatura
|
|
20
|
+
# Coerce +i+ to an Integer and check that it lies in the half-open range
# [min, max).
#
# @param i [#to_int] candidate value
# @param type_name [String] label interpolated into the RangeError message
# @param min [Integer] inclusive lower bound
# @param max [Integer] exclusive upper bound
# @return [Integer] the coerced value
# @raise [TypeError] if +i+ does not respond to +to_int+
# @raise [RangeError] if the coerced value is outside [min, max)
def self.uniffi_in_range(i, type_name, min, max)
  if !i.respond_to?(:to_int)
    raise TypeError, "no implicit conversion of #{i} into Integer"
  end

  coerced = i.to_int
  within_bounds = min <= coerced && coerced < max
  raise RangeError, "#{type_name} requires #{min} <= value < #{max}" unless within_bounds

  coerced
end
|
|
26
|
+
|
|
27
|
+
# Convert +v+ to a UTF-8 encoded String and verify the bytes are valid.
#
# @param v [#to_str] string-like value
# @return [String] the value transcoded to UTF-8
# @raise [TypeError] if +v+ does not respond to +to_str+
# @raise [Encoding::InvalidByteSequenceError] if the resulting string is not
#   valid in its encoding
def self.uniffi_utf8(v)
  if !v.respond_to?(:to_str)
    raise TypeError, "no implicit conversion of #{v} into String"
  end

  text = v.to_str.encode(Encoding::UTF_8)
  return text if text.valid_encoding?

  raise Encoding::InvalidByteSequenceError, "not a valid UTF-8 encoded string"
end
|
|
33
|
+
|
|
34
|
+
# Return +v+ as a String without changing its encoding.
#
# @param v [#to_str] string-like value
# @return [String] the result of +v.to_str+
# @raise [TypeError] if +v+ does not respond to +to_str+
def self.uniffi_bytes(v)
  return v.to_str if v.respond_to?(:to_str)

  raise TypeError, "no implicit conversion of #{v} into String"
end
|
|
38
|
+
|
|
39
|
+
# FFI struct with three fields: a 64-bit capacity, a 64-bit length and a data
# pointer. Allocation, growth and release are delegated to the Rust library
# through Trafilatura.rust_call.
#
# Besides the raw struct accessors, the class carries one trio of helpers per
# wire type: `check_lower_*` (pre-flight validation), `alloc_from_*`
# (serialize a Ruby value into a fresh buffer) and `consumeInto*`
# (deserialize and free the buffer).
class RustBuffer < FFI::Struct
  layout :capacity, :uint64,
         :len,      :uint64,
         :data,     :pointer

  # Request a buffer of +size+ bytes from the Rust allocator.
  def self.alloc(size)
    Trafilatura.rust_call(:ffi_trafilatura_uniffi_rustbuffer_alloc, size)
  end

  # Ask Rust to make room for +additional+ more bytes in +rbuf+; returns the
  # buffer handed back by the library.
  def self.reserve(rbuf, additional)
    Trafilatura.rust_call(:ffi_trafilatura_uniffi_rustbuffer_reserve, rbuf, additional)
  end

  # Hand this buffer back to Rust for deallocation.
  def free
    Trafilatura.rust_call(:ffi_trafilatura_uniffi_rustbuffer_free, self)
  end

  def capacity
    self[:capacity]
  end

  def len
    self[:len]
  end

  def len=(value)
    self[:len] = value
  end

  def data
    self[:data]
  end

  def to_s
    "RustBuffer(capacity=#{capacity}, len=#{len}, data=#{data.read_bytes(len)})"
  end

  # Yield a fresh RustBufferBuilder and return whatever the block returns.
  # If the block raises a StandardError the builder is discarded (freeing
  # any buffer it still owns) before the error is re-raised, so the
  # allocation is not leaked.
  def self.allocWithBuilder
    pending = RustBufferBuilder.new

    begin
      yield pending
    rescue StandardError
      pending.discard
      raise
    end
  end

  # Yield a RustBufferStream over this buffer. The buffer is freed when the
  # method exits, whether normally or through an error.
  #
  # NOTE(review): the consumeInto* helpers in this class leave the block with
  # `return`, which unwinds past the leftover-data check below; confirm
  # whether that check is expected to run for them.
  def consumeWithStream
    reader = RustBufferStream.new(self)

    yield reader

    raise RuntimeError, 'junk data left in buffer after consuming' unless reader.remaining.zero?
  ensure
    free
  end

  # The primitive String type.

  def self.allocFromString(value)
    allocWithBuilder do |sink|
      sink.write(value.encode('utf-8'))
      sink.finalize
    end
  end

  def consumeIntoString
    consumeWithStream do |reader|
      raw = reader.read(reader.remaining)
      return raw.force_encoding(Encoding::UTF_8)
    end
  end

  # The Record type ExtractResult.

  # Only the nested metadata record has fields that need checking.
  def self.check_lower_TypeExtractResult(v)
    check_lower_TypeMetadata(v.metadata)
  end

  def self.alloc_from_TypeExtractResult(v)
    allocWithBuilder do |sink|
      sink.write_TypeExtractResult(v)
      sink.finalize
    end
  end

  def consumeIntoTypeExtractResult
    consumeWithStream { |reader| return reader.readTypeExtractResult }
  end

  # The Record type ExtractionConfig.

  # No field of this record has a check; the method is a no-op returning nil.
  def self.check_lower_TypeExtractionConfig(v)
  end

  def self.alloc_from_TypeExtractionConfig(v)
    allocWithBuilder do |sink|
      sink.write_TypeExtractionConfig(v)
      sink.finalize
    end
  end

  def consumeIntoTypeExtractionConfig
    consumeWithStream { |reader| return reader.readTypeExtractionConfig }
  end

  # The Record type ExtractionOptions.

  # Runs the per-field checks in declaration order; boolean fields have none.
  def self.check_lower_TypeExtractionOptions(v)
    check_lower_TypeExtractionConfig(v.config)
    check_lower_Optionalstring(v.original_url)
    check_lower_Optionalstring(v.target_language)
    check_lower_TypeExtractionFocus(v.focus)
    check_lower_Optionali64(v.max_tree_size)
    check_lower_Optionalstring(v.prune_selector)
    check_lower_TypeHtmlDateMode(v.html_date_mode)
    check_lower_Optionalstring(v.html_date_override)
  end

  def self.alloc_from_TypeExtractionOptions(v)
    allocWithBuilder do |sink|
      sink.write_TypeExtractionOptions(v)
      sink.finalize
    end
  end

  def consumeIntoTypeExtractionOptions
    consumeWithStream { |reader| return reader.readTypeExtractionOptions }
  end

  # The Record type Metadata.

  # Checks the optional date and the two string sequences; plain string
  # fields have no check.
  def self.check_lower_TypeMetadata(v)
    check_lower_Optionalstring(v.date)
    check_lower_Sequencestring(v.categories)
    check_lower_Sequencestring(v.tags)
  end

  def self.alloc_from_TypeMetadata(v)
    allocWithBuilder do |sink|
      sink.write_TypeMetadata(v)
      sink.finalize
    end
  end

  def consumeIntoTypeMetadata
    consumeWithStream { |reader| return reader.readTypeMetadata }
  end

  # The Enum type ExtractionFocus.

  # No-op: enum values are not validated here.
  def self.check_lower_TypeExtractionFocus(v)
  end

  def self.alloc_from_TypeExtractionFocus(v)
    allocWithBuilder do |sink|
      sink.write_TypeExtractionFocus(v)
      sink.finalize
    end
  end

  def consumeIntoTypeExtractionFocus
    consumeWithStream { |reader| return reader.readTypeExtractionFocus }
  end

  # The Enum type HtmlDateMode.

  # No-op: enum values are not validated here.
  def self.check_lower_TypeHtmlDateMode(v)
  end

  def self.alloc_from_TypeHtmlDateMode(v)
    allocWithBuilder do |sink|
      sink.write_TypeHtmlDateMode(v)
      sink.finalize
    end
  end

  def consumeIntoTypeHtmlDateMode
    consumeWithStream { |reader| return reader.readTypeHtmlDateMode }
  end

  # The Optional<T> type for i64.

  # Returns nil for both nil and present values; the inner i64 has no check.
  def self.check_lower_Optionali64(v)
    return if v.nil?

    nil
  end

  def self.alloc_from_Optionali64(v)
    allocWithBuilder do |sink|
      sink.write_Optionali64(v)
      sink.finalize
    end
  end

  def consumeIntoOptionali64
    consumeWithStream { |reader| return reader.readOptionali64 }
  end

  # The Optional<T> type for string.

  # Returns nil for both nil and present values; the inner string has no check.
  def self.check_lower_Optionalstring(v)
    return if v.nil?

    nil
  end

  def self.alloc_from_Optionalstring(v)
    allocWithBuilder do |sink|
      sink.write_Optionalstring(v)
      sink.finalize
    end
  end

  def consumeIntoOptionalstring
    consumeWithStream { |reader| return reader.readOptionalstring }
  end

  # The Sequence<T> type for string.

  # Iterates the collection without inspecting the items, so the only effect
  # is to require that +v+ responds to +each+.
  def self.check_lower_Sequencestring(v)
    v.each { |_item| }
  end

  def self.alloc_from_Sequencestring(v)
    allocWithBuilder do |sink|
      sink.write_Sequencestring(v)
      sink.finalize
    end
  end

  def consumeIntoSequencestring
    consumeWithStream { |reader| return reader.readSequencestring }
  end
end
|
|
330
|
+
|
|
331
|
+
module UniFFILib
  # FFI struct with a signed 32-bit length followed by a data pointer.
  class ForeignBytes < FFI::Struct
    layout :len,  :int32,
           :data, :pointer

    # Value of the +len+ field.
    def len
      self[:len]
    end

    # Value of the +data+ pointer field.
    def data
      self[:data]
    end

    # Debug rendering that includes the bytes the pointer refers to.
    def to_s
      payload = data.read_bytes(len)
      "ForeignBytes(len=#{len}, data=#{payload})"
    end
  end
end
|
|
349
|
+
|
|
350
|
+
private_constant :UniFFILib
|
|
351
|
+
|
|
352
|
+
# Helper for structured reading of values from a RustBuffer.
#
# The stream keeps a byte offset into the wrapped buffer and advances it as
# each value is decoded. Integers are unpacked big-endian (`l>` for 32-bit,
# `q>` for 64-bit); booleans and option flags are a single signed byte.
class RustBufferStream
  def initialize(rbuf)
    @rbuf = rbuf
    @offset = 0
  end

  # Bytes between the current offset and the end of the buffer.
  def remaining
    @rbuf.len - @offset
  end

  # Return the next +size+ raw bytes and advance past them.
  def read(size)
    raise InternalError, 'read past end of rust buffer' if size > remaining

    chunk = @rbuf.data.get_bytes(@offset, size)
    @offset += size
    chunk
  end

  def readI32
    unpack_from(4, 'l>')
  end

  def readI64
    unpack_from(8, 'q>')
  end

  # A boolean is one byte that must be exactly 0 or 1.
  def readBool
    case unpack_from(1, 'c')
    when 0 then false
    when 1 then true
    else raise InternalError, 'Unexpected byte for Boolean type'
    end
  end

  # A string is an i32 byte count followed by that many bytes, tagged UTF-8.
  def readString
    byte_count = unpack_from(4, 'l>')
    raise InternalError, 'Unexpected negative string length' if byte_count.negative?

    read(byte_count).force_encoding(Encoding::UTF_8)
  end

  # The Record type ExtractResult.

  def readTypeExtractResult
    ExtractResult.new(
      content_text: readString,
      comments_text: readString,
      content_html: readString,
      comments_html: readString,
      metadata: readTypeMetadata
    )
  end

  # The Record type ExtractionConfig.

  def readTypeExtractionConfig
    ExtractionConfig.new(
      min_extracted_size: readI32,
      min_extracted_comment_size: readI32,
      min_output_size: readI32,
      min_output_comment_size: readI32
    )
  end

  # The Record type ExtractionOptions.

  def readTypeExtractionOptions
    ExtractionOptions.new(
      config: readTypeExtractionConfig,
      original_url: readOptionalstring,
      target_language: readOptionalstring,
      enable_fallback: readBool,
      focus: readTypeExtractionFocus,
      exclude_comments: readBool,
      exclude_tables: readBool,
      include_images: readBool,
      include_links: readBool,
      deduplicate: readBool,
      require_essential_metadata: readBool,
      max_tree_size: readOptionali64,
      prune_selector: readOptionalstring,
      html_date_mode: readTypeHtmlDateMode,
      html_date_override: readOptionalstring
    )
  end

  # The Record type Metadata.

  def readTypeMetadata
    Metadata.new(
      title: readString,
      author: readString,
      url: readString,
      hostname: readString,
      description: readString,
      sitename: readString,
      date: readOptionalstring,
      categories: readSequencestring,
      tags: readSequencestring,
      id: readString,
      fingerprint: readString,
      license: readString,
      language: readString,
      image: readString,
      page_type: readString
    )
  end

  # The Enum type ExtractionFocus.

  # The variant is an i32 tag in 1..3; anything else is an InternalError.
  def readTypeExtractionFocus
    case unpack_from(4, 'l>')
    when 1 then ExtractionFocus::BALANCED
    when 2 then ExtractionFocus::FAVOR_RECALL
    when 3 then ExtractionFocus::FAVOR_PRECISION
    else raise InternalError, 'Unexpected variant tag for TypeExtractionFocus'
    end
  end

  # The Enum type HtmlDateMode.

  # The variant is an i32 tag in 1..4; anything else is an InternalError.
  def readTypeHtmlDateMode
    case unpack_from(4, 'l>')
    when 1 then HtmlDateMode::AUTOMATIC
    when 2 then HtmlDateMode::FAST
    when 3 then HtmlDateMode::EXTENSIVE
    when 4 then HtmlDateMode::DISABLED
    else raise InternalError, 'Unexpected variant tag for TypeHtmlDateMode'
    end
  end

  # The Error type TrafilaturaError

  # Decodes an i32 variant tag followed by that variant's fields and returns
  # the matching exception instance (it is returned, not raised).
  def readTypeTrafilaturaError
    case unpack_from(4, 'l>')
    when 1
      TrafilaturaError::ParseError.new(readString())
    when 2
      TrafilaturaError::LanguageMismatch.new(readString(), readString())
    when 3
      TrafilaturaError::InsufficientContent.new(readString())
    when 4
      TrafilaturaError::MissingMetadata.new(readString())
    when 5
      TrafilaturaError::DuplicateContent.new
    when 6
      TrafilaturaError::TreeTooLarge.new(readI64())
    else
      raise InternalError, 'Unexpected variant tag for TypeTrafilaturaError'
    end
  end

  # The Optional<T> type for i64.

  # One flag byte: 0 means nil, 1 means an i64 follows.
  def readOptionali64
    case unpack_from(1, 'c')
    when 0 then nil
    when 1 then readI64
    else raise InternalError, 'Unexpected flag byte for Optionali64'
    end
  end

  # The Optional<T> type for string.

  # One flag byte: 0 means nil, 1 means a string follows.
  def readOptionalstring
    case unpack_from(1, 'c')
    when 0 then nil
    when 1 then readString
    else raise InternalError, 'Unexpected flag byte for Optionalstring'
    end
  end

  # The Sequence<T> type for string.

  # An i32 element count followed by that many strings.
  def readSequencestring
    element_count = unpack_from(4, 'l>')
    raise InternalError, 'Unexpected negative sequence length' if element_count.negative?

    Array.new(element_count) { readString }
  end

  # Decode +size+ bytes at the current offset with the given +format+
  # (a String#unpack directive) and advance past them. Exactly one value is
  # expected from the directive.
  def unpack_from(size, format)
    raise InternalError, 'read past end of rust buffer' if size > remaining

    decoded = @rbuf.data.get_bytes(@offset, size).unpack(format)
    @offset += size

    # TODO: verify this
    raise 'more than one element!!!' if decoded.size > 1

    decoded[0]
  end
end
|
|
615
|
+
|
|
616
|
+
private_constant :RustBufferStream
|
|
617
|
+
|
|
618
|
+
# Helper for structured writing of values into a RustBuffer.
#
# Values are appended at the buffer's current +len+; the private +reserve+
# helper grows the buffer through RustBuffer.reserve when needed and bumps
# +len+ after each write. Integers are packed big-endian (`l>` / `q>`),
# booleans and option flags as a single byte.
#
# Payloads are sized with String#bytesize and copied with +put_bytes+, so no
# intermediate Array of per-byte Integers is built for each write.
class RustBufferBuilder
  # Starts with a 16-byte allocation whose length is reset to zero.
  def initialize
    @rust_buf = RustBuffer.alloc 16
    @rust_buf.len = 0
  end

  # Hand the underlying buffer to the caller and forget it. After this call
  # the builder owns nothing, so a later +discard+ is a no-op.
  def finalize
    rbuf = @rust_buf

    @rust_buf = nil

    rbuf
  end

  # Free the underlying buffer unless it has already been finalized.
  def discard
    return if @rust_buf.nil?

    rbuf = finalize
    rbuf.free
  end

  # Append the raw bytes of the String +value+.
  def write(value)
    reserve(value.bytesize) do
      @rust_buf.data.put_bytes(@rust_buf.len, value)
    end
  end

  def write_I32(v)
    v = Trafilatura::uniffi_in_range(v, "i32", -2**31, 2**31)
    pack_into(4, 'l>', v)
  end

  def write_I64(v)
    v = Trafilatura::uniffi_in_range(v, "i64", -2**63, 2**63)
    pack_into(8, 'q>', v)
  end

  # Any truthy value is written as 1, false/nil as 0.
  def write_Bool(v)
    pack_into(1, 'c', v ? 1 : 0)
  end

  # A string is written as an i32 byte count followed by its UTF-8 bytes.
  def write_String(v)
    v = Trafilatura::uniffi_utf8(v)
    pack_into 4, 'l>', v.bytesize
    write v
  end

  # The Record type ExtractResult.

  def write_TypeExtractResult(v)
    self.write_String(v.content_text)
    self.write_String(v.comments_text)
    self.write_String(v.content_html)
    self.write_String(v.comments_html)
    self.write_TypeMetadata(v.metadata)
  end

  # The Record type ExtractionConfig.

  def write_TypeExtractionConfig(v)
    self.write_I32(v.min_extracted_size)
    self.write_I32(v.min_extracted_comment_size)
    self.write_I32(v.min_output_size)
    self.write_I32(v.min_output_comment_size)
  end

  # The Record type ExtractionOptions.

  def write_TypeExtractionOptions(v)
    self.write_TypeExtractionConfig(v.config)
    self.write_Optionalstring(v.original_url)
    self.write_Optionalstring(v.target_language)
    self.write_Bool(v.enable_fallback)
    self.write_TypeExtractionFocus(v.focus)
    self.write_Bool(v.exclude_comments)
    self.write_Bool(v.exclude_tables)
    self.write_Bool(v.include_images)
    self.write_Bool(v.include_links)
    self.write_Bool(v.deduplicate)
    self.write_Bool(v.require_essential_metadata)
    self.write_Optionali64(v.max_tree_size)
    self.write_Optionalstring(v.prune_selector)
    self.write_TypeHtmlDateMode(v.html_date_mode)
    self.write_Optionalstring(v.html_date_override)
  end

  # The Record type Metadata.

  def write_TypeMetadata(v)
    self.write_String(v.title)
    self.write_String(v.author)
    self.write_String(v.url)
    self.write_String(v.hostname)
    self.write_String(v.description)
    self.write_String(v.sitename)
    self.write_Optionalstring(v.date)
    self.write_Sequencestring(v.categories)
    self.write_Sequencestring(v.tags)
    self.write_String(v.id)
    self.write_String(v.fingerprint)
    self.write_String(v.license)
    self.write_String(v.language)
    self.write_String(v.image)
    self.write_String(v.page_type)
  end

  # The Enum type ExtractionFocus.

  # The enum value is written as-is as an i32 tag.
  def write_TypeExtractionFocus(v)
    pack_into(4, 'l>', v)
  end

  # The Enum type HtmlDateMode.

  # The enum value is written as-is as an i32 tag.
  def write_TypeHtmlDateMode(v)
    pack_into(4, 'l>', v)
  end

  # The Optional<T> type for i64.

  # Flag byte 0 for nil; flag byte 1 followed by the i64 otherwise.
  def write_Optionali64(v)
    if v.nil?
      pack_into(1, 'c', 0)
    else
      pack_into(1, 'c', 1)
      self.write_I64(v)
    end
  end

  # The Optional<T> type for string.

  # Flag byte 0 for nil; flag byte 1 followed by the string otherwise.
  def write_Optionalstring(v)
    if v.nil?
      pack_into(1, 'c', 0)
    else
      pack_into(1, 'c', 1)
      self.write_String(v)
    end
  end

  # The Sequence<T> type for string.

  # An i32 element count followed by each string in order.
  def write_Sequencestring(items)
    pack_into(4, 'l>', items.size)

    items.each do |item|
      self.write_String(item)
    end
  end

  private

  # Ensure +num_bytes+ fit after the current length, run the block that
  # performs the actual write, then advance the length. The length is only
  # bumped if the block completes.
  def reserve(num_bytes)
    if @rust_buf.len + num_bytes > @rust_buf.capacity
      @rust_buf = RustBuffer.reserve(@rust_buf, num_bytes)
    end

    yield

    @rust_buf.len += num_bytes
  end

  # Pack +value+ with the Array#pack directive +format+ and append the
  # resulting +size+ bytes.
  def pack_into(size, format, value)
    reserve(size) do
      @rust_buf.data.put_bytes(@rust_buf.len, [value].pack(format))
    end
  end
end
|
|
793
|
+
|
|
794
|
+
private_constant :RustBufferBuilder
|
|
795
|
+
|
|
796
|
+
# Error definitions
|
|
797
|
+
# FFI struct with a signed 8-bit status code followed by an embedded
# RustBuffer. In this file it is appended as the last argument of every call
# into the native library, and its code is inspected after the call returns.
class RustCallStatus < FFI::Struct
  layout :code,      :int8,
         :error_buf, RustBuffer

  # Status code field.
  def code
    self[:code]
  end

  # Embedded RustBuffer field.
  def error_buf
    self[:error_buf]
  end

  # Debug rendering; shows only the code.
  def to_s
    "RustCallStatus(code=#{code})"
  end
end
|
|
813
|
+
|
|
814
|
+
# These match the values from the uniffi::rustcalls module
|
|
815
|
+
CALL_SUCCESS = 0
|
|
816
|
+
CALL_ERROR = 1
|
|
817
|
+
CALL_PANIC = 2
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
|
|
822
|
+
# Namespace for the error variants decoded by
# RustBufferStream#readTypeTrafilaturaError. Each variant is a StandardError
# that stores its fields in readers and renders itself as
# "ClassName(field=value, ...)".
module TrafilaturaError
  # Variant carrying a single +reason+.
  class ParseError < StandardError
    attr_reader :reason

    def initialize(reason)
      super()
      @reason = reason
    end

    def to_s
      "#{self.class.name}(reason=#{reason.inspect})"
    end
  end

  # Variant carrying the +expected+ and +got+ values.
  class LanguageMismatch < StandardError
    attr_reader :expected, :got

    def initialize(expected, got)
      super()
      @expected = expected
      @got = got
    end

    def to_s
      "#{self.class.name}(expected=#{expected.inspect}, got=#{got.inspect})"
    end
  end

  # Variant carrying a single +reason+.
  class InsufficientContent < StandardError
    attr_reader :reason

    def initialize(reason)
      super()
      @reason = reason
    end

    def to_s
      "#{self.class.name}(reason=#{reason.inspect})"
    end
  end

  # Variant carrying a single +reason+.
  class MissingMetadata < StandardError
    attr_reader :reason

    def initialize(reason)
      super()
      @reason = reason
    end

    def to_s
      "#{self.class.name}(reason=#{reason.inspect})"
    end
  end

  # Variant with no fields; the constructor deliberately takes no arguments.
  class DuplicateContent < StandardError
    def initialize
      super()
    end

    def to_s
      "#{self.class.name}()"
    end
  end

  # Variant carrying a +size+.
  class TreeTooLarge < StandardError
    attr_reader :size

    def initialize(size)
      super()
      @size = size
    end

    def to_s
      "#{self.class.name}(size=#{size.inspect})"
    end
  end
end
|
|
900
|
+
|
|
901
|
+
|
|
902
|
+
# Map error modules to the RustBuffer method name that reads them
|
|
903
|
+
ERROR_MODULE_TO_READER_METHOD = {
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
|
|
907
|
+
TrafilaturaError => :readTypeTrafilaturaError,
|
|
908
|
+
|
|
909
|
+
}
|
|
910
|
+
|
|
911
|
+
private_constant :ERROR_MODULE_TO_READER_METHOD, :CALL_SUCCESS, :CALL_ERROR, :CALL_PANIC,
|
|
912
|
+
:RustCallStatus
|
|
913
|
+
|
|
914
|
+
# Decode +rust_buffer+ into an error object.
#
# The reader method is looked up by +error_module+ in
# ERROR_MODULE_TO_READER_METHOD and invoked on the stream that
# +consumeWithStream+ yields; its result is returned to the caller.
def self.consume_buffer_into_error(error_module, rust_buffer)
  reader_method = ERROR_MODULE_TO_READER_METHOD[error_module]

  rust_buffer.consumeWithStream { |stream| return stream.send(reader_method) }
end
|
|
920
|
+
|
|
921
|
+
# Raised by this file for unexpected wire data, a CALL_ERROR with no error
# module, a Rust panic, or an unknown call status.
class InternalError < StandardError; end
|
|
923
|
+
|
|
924
|
+
# Call a rust function.
#
# Delegates to rust_call_with_error with a nil error module.
def self.rust_call(fn_name, *args)
  no_error_module = nil
  rust_call_with_error(no_error_module, fn_name, *args)
end
|
|
928
|
+
|
|
929
|
+
# Call a rust function and handle errors.
#
# Use this when the rust function returns a Result<>. error_module must be
# the error_module that corresponds to that Result.
#
# A fresh RustCallStatus is appended to +args+, the function is invoked on
# UniFFILib, and the status code decides the outcome:
# * CALL_SUCCESS - the function's return value is returned.
# * CALL_ERROR   - the error buffer is decoded via +error_module+ and the
#                  resulting error raised; with no error module the buffer is
#                  freed and InternalError raised.
# * CALL_PANIC   - InternalError, using the buffer's text when it is non-empty.
# * other codes  - InternalError.
def self.rust_call_with_error(error_module, fn_name, *args)
  # Note: RustCallStatus.new zeroes out the struct, which is exactly what we
  # want to pass to Rust (code=0, error_buf=RustBuffer(len=0, capacity=0,
  # data=NULL))
  status = RustCallStatus.new

  result = UniFFILib.public_send(fn_name, *args, status)

  code = status.code
  return result if code == CALL_SUCCESS

  if code == CALL_ERROR
    unless error_module.nil?
      raise consume_buffer_into_error(error_module, status.error_buf)
    end

    status.error_buf.free
    raise InternalError, "CALL_ERROR with no error_module set"
  end

  if code == CALL_PANIC
    # When the rust code sees a panic, it tries to construct a RustBuffer
    # with the message. But if that code panics, then it just sends back
    # an empty buffer.
    raise InternalError, "Rust panic" unless status.error_buf.len > 0

    raise InternalError, status.error_buf.consumeIntoString()
  end

  raise InternalError, "Unknown call status: #{status.code}"
end
|
|
966
|
+
|
|
967
|
+
private_class_method :consume_buffer_into_error
|
|
968
|
+
|
|
969
|
+
# This is how we find and load the dynamic library provided by the component.
# For now we just look it up by name: the shared object is expected to sit in
# the same directory as this file.
module UniFFILib
  extend FFI::Library

  ffi_lib File.join(__dir__, 'libtrafilatura_uniffi.so')

  # Exported API functions. Every signature ends with a RustCallStatus passed
  # by reference.
  attach_function :uniffi_trafilatura_uniffi_fn_func_create_readable_document,
                  [RustBuffer.by_value, RustCallStatus.by_ref], RustBuffer.by_value
  attach_function :uniffi_trafilatura_uniffi_fn_func_default_config,
                  [RustCallStatus.by_ref], RustBuffer.by_value
  attach_function :uniffi_trafilatura_uniffi_fn_func_default_options,
                  [RustCallStatus.by_ref], RustBuffer.by_value
  attach_function :uniffi_trafilatura_uniffi_fn_func_extract,
                  [RustBuffer.by_value, RustBuffer.by_value, RustCallStatus.by_ref], RustBuffer.by_value
  attach_function :uniffi_trafilatura_uniffi_fn_func_extract_simple,
                  [RustBuffer.by_value, RustCallStatus.by_ref], RustBuffer.by_value

  # RustBuffer management.
  attach_function :ffi_trafilatura_uniffi_rustbuffer_alloc,
                  [:uint64, RustCallStatus.by_ref], RustBuffer.by_value
  # NOTE(review): ForeignBytes is given as the struct class rather than
  # `.by_value`, unlike RustBuffer elsewhere in this list - confirm this
  # matches the native signature.
  attach_function :ffi_trafilatura_uniffi_rustbuffer_from_bytes,
                  [ForeignBytes, RustCallStatus.by_ref], RustBuffer.by_value
  attach_function :ffi_trafilatura_uniffi_rustbuffer_free,
                  [RustBuffer.by_value, RustCallStatus.by_ref], :void
  attach_function :ffi_trafilatura_uniffi_rustbuffer_reserve,
                  [RustBuffer.by_value, :uint64, RustCallStatus.by_ref], RustBuffer.by_value

  # Checksum functions, one per exported API function; all share the
  # signature (RustCallStatus*) -> uint16.
  %i[
    uniffi_trafilatura_uniffi_checksum_func_create_readable_document
    uniffi_trafilatura_uniffi_checksum_func_default_config
    uniffi_trafilatura_uniffi_checksum_func_default_options
    uniffi_trafilatura_uniffi_checksum_func_extract
    uniffi_trafilatura_uniffi_checksum_func_extract_simple
  ].each do |checksum_fn|
    attach_function checksum_fn, [RustCallStatus.by_ref], :uint16
  end

  attach_function :ffi_trafilatura_uniffi_uniffi_contract_version,
                  [RustCallStatus.by_ref], :uint32
end
|
|
1025
|
+
|
|
1026
|
+
# Public interface members begin here.
|
|
1027
|
+
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
# Enumeration ExtractionFocus.
#
# Each variant is exposed as an Integer constant.
# NOTE(review): presumably these are the values accepted by
# ExtractionOptions#focus and the numbers must match the variant indices the
# Rust side reads/writes - confirm before changing any of them.
class ExtractionFocus
  BALANCED = 1
  FAVOR_RECALL = 2
  FAVOR_PRECISION = 3
end
|
|
1038
|
+
|
|
1039
|
+
|
|
1040
|
+
|
|
1041
|
+
|
|
1042
|
+
|
|
1043
|
+
|
|
1044
|
+
# Enumeration HtmlDateMode.
#
# Each variant is exposed as an Integer constant.
# NOTE(review): presumably these are the values accepted by
# ExtractionOptions#html_date_mode and the numbers must match the variant
# indices the Rust side reads/writes - confirm before changing any of them.
class HtmlDateMode
  AUTOMATIC = 1
  FAST = 2
  EXTENSIVE = 3
  DISABLED = 4
end
|
|
1051
|
+
|
|
1052
|
+
|
|
1053
|
+
|
|
1054
|
+
|
|
1055
|
+
# Record type ExtractResult
#
# Read-only value object with five fields. Instances are what `extract` and
# `extract_simple` return, and what `create_readable_document` accepts.
class ExtractResult
  attr_reader :content_text, :comments_text, :content_html, :comments_html, :metadata

  # All fields are required keyword arguments and are stored as given.
  def initialize(content_text:, comments_text:, content_html:, comments_html:, metadata:)
    @content_text = content_text
    @comments_text = comments_text
    @content_html = content_html
    @comments_html = comments_html
    @metadata = metadata
  end

  # Field-by-field equality.
  #
  # @param other [Object] value to compare against
  # @return [Boolean] true only when `other` is an ExtractResult whose fields
  #   all compare equal to this record's fields
  def ==(other)
    # Comparing against nil or an unrelated object must answer false rather
    # than raise NoMethodError on the missing readers.
    return false unless other.is_a?(ExtractResult)

    content_text == other.content_text &&
      comments_text == other.comments_text &&
      content_html == other.content_html &&
      comments_html == other.comments_html &&
      metadata == other.metadata
  end
end
|
|
1087
|
+
|
|
1088
|
+
# Record type ExtractionConfig
#
# Read-only value object with four size thresholds. `default_config` returns
# an instance of this record.
# NOTE(review): units and valid ranges of the thresholds are not visible
# here - confirm against the Rust definition.
class ExtractionConfig
  attr_reader :min_extracted_size, :min_extracted_comment_size, :min_output_size, :min_output_comment_size

  # All fields are required keyword arguments and are stored as given.
  def initialize(min_extracted_size:, min_extracted_comment_size:, min_output_size:, min_output_comment_size:)
    @min_extracted_size = min_extracted_size
    @min_extracted_comment_size = min_extracted_comment_size
    @min_output_size = min_output_size
    @min_output_comment_size = min_output_comment_size
  end

  # Field-by-field equality.
  #
  # @param other [Object] value to compare against
  # @return [Boolean] true only when `other` is an ExtractionConfig whose
  #   fields all compare equal to this record's fields
  def ==(other)
    # Comparing against nil or an unrelated object must answer false rather
    # than raise NoMethodError on the missing readers.
    return false unless other.is_a?(ExtractionConfig)

    min_extracted_size == other.min_extracted_size &&
      min_extracted_comment_size == other.min_extracted_comment_size &&
      min_output_size == other.min_output_size &&
      min_output_comment_size == other.min_output_comment_size
  end
end
|
|
1116
|
+
|
|
1117
|
+
# Record type ExtractionOptions
#
# Read-only value object with the fifteen settings passed to `extract`.
# `default_options` returns an instance of this record.
# NOTE(review): `config` is presumably an ExtractionConfig, `focus` an
# ExtractionFocus constant and `html_date_mode` an HtmlDateMode constant -
# confirm against the RustBuffer writer for this type.
class ExtractionOptions
  attr_reader :config, :original_url, :target_language, :enable_fallback, :focus,
              :exclude_comments, :exclude_tables, :include_images, :include_links,
              :deduplicate, :require_essential_metadata, :max_tree_size,
              :prune_selector, :html_date_mode, :html_date_override

  # All fields are required keyword arguments and are stored as given.
  def initialize(config:, original_url:, target_language:, enable_fallback:, focus:,
                 exclude_comments:, exclude_tables:, include_images:, include_links:,
                 deduplicate:, require_essential_metadata:, max_tree_size:,
                 prune_selector:, html_date_mode:, html_date_override:)
    @config = config
    @original_url = original_url
    @target_language = target_language
    @enable_fallback = enable_fallback
    @focus = focus
    @exclude_comments = exclude_comments
    @exclude_tables = exclude_tables
    @include_images = include_images
    @include_links = include_links
    @deduplicate = deduplicate
    @require_essential_metadata = require_essential_metadata
    @max_tree_size = max_tree_size
    @prune_selector = prune_selector
    @html_date_mode = html_date_mode
    @html_date_override = html_date_override
  end

  # Field-by-field equality.
  #
  # @param other [Object] value to compare against
  # @return [Boolean] true only when `other` is an ExtractionOptions whose
  #   fields all compare equal to this record's fields
  def ==(other)
    # Comparing against nil or an unrelated object must answer false rather
    # than raise NoMethodError on the missing readers.
    return false unless other.is_a?(ExtractionOptions)

    config == other.config &&
      original_url == other.original_url &&
      target_language == other.target_language &&
      enable_fallback == other.enable_fallback &&
      focus == other.focus &&
      exclude_comments == other.exclude_comments &&
      exclude_tables == other.exclude_tables &&
      include_images == other.include_images &&
      include_links == other.include_links &&
      deduplicate == other.deduplicate &&
      require_essential_metadata == other.require_essential_metadata &&
      max_tree_size == other.max_tree_size &&
      prune_selector == other.prune_selector &&
      html_date_mode == other.html_date_mode &&
      html_date_override == other.html_date_override
  end
end
|
|
1189
|
+
|
|
1190
|
+
# Record type Metadata
#
# Read-only value object with fifteen descriptive fields.
# NOTE(review): presumably this is the value carried in
# ExtractResult#metadata - confirm against the RustBuffer reader for
# ExtractResult.
class Metadata
  attr_reader :title, :author, :url, :hostname, :description, :sitename, :date,
              :categories, :tags, :id, :fingerprint, :license, :language, :image,
              :page_type

  # All fields are required keyword arguments and are stored as given.
  def initialize(title:, author:, url:, hostname:, description:, sitename:, date:,
                 categories:, tags:, id:, fingerprint:, license:, language:, image:,
                 page_type:)
    @title = title
    @author = author
    @url = url
    @hostname = hostname
    @description = description
    @sitename = sitename
    @date = date
    @categories = categories
    @tags = tags
    @id = id
    @fingerprint = fingerprint
    @license = license
    @language = language
    @image = image
    @page_type = page_type
  end

  # Field-by-field equality.
  #
  # @param other [Object] value to compare against
  # @return [Boolean] true only when `other` is a Metadata whose fields all
  #   compare equal to this record's fields
  def ==(other)
    # Comparing against nil or an unrelated object must answer false rather
    # than raise NoMethodError on the missing readers.
    return false unless other.is_a?(Metadata)

    title == other.title &&
      author == other.author &&
      url == other.url &&
      hostname == other.hostname &&
      description == other.description &&
      sitename == other.sitename &&
      date == other.date &&
      categories == other.categories &&
      tags == other.tags &&
      id == other.id &&
      fingerprint == other.fingerprint &&
      license == other.license &&
      language == other.language &&
      image == other.image &&
      page_type == other.page_type
  end
end
|
|
1262
|
+
|
|
1263
|
+
|
|
1264
|
+
|
|
1265
|
+
|
|
1266
|
+
|
|
1267
|
+
# Calls the Rust function
# `uniffi_trafilatura_uniffi_fn_func_create_readable_document`.
#
# `result` is first checked with RustBuffer.check_lower_TypeExtractResult,
# then serialised with RustBuffer.alloc_from_TypeExtractResult and handed to
# the native call. The returned buffer is consumed as a String.
#
# @param result [ExtractResult] record to pass to the native function
# @return [String] value produced by `consumeIntoString` on the returned buffer
# NOTE(review): check_lower_* presumably raises on malformed input - confirm
# which exception type callers should expect.
def self.create_readable_document(result)
  RustBuffer.check_lower_TypeExtractResult(result)

  buffer = Trafilatura.rust_call(
    :uniffi_trafilatura_uniffi_fn_func_create_readable_document,
    RustBuffer.alloc_from_TypeExtractResult(result)
  )
  buffer.consumeIntoString
end
|
|
1274
|
+
|
|
1275
|
+
|
|
1276
|
+
|
|
1277
|
+
|
|
1278
|
+
|
|
1279
|
+
# Calls the Rust function `uniffi_trafilatura_uniffi_fn_func_default_config`
# with no arguments and decodes the returned buffer.
#
# @return [ExtractionConfig] value produced by
#   `consumeIntoTypeExtractionConfig` on the returned buffer
def self.default_config
  buffer = Trafilatura.rust_call(:uniffi_trafilatura_uniffi_fn_func_default_config)
  buffer.consumeIntoTypeExtractionConfig
end
|
|
1283
|
+
|
|
1284
|
+
|
|
1285
|
+
|
|
1286
|
+
|
|
1287
|
+
|
|
1288
|
+
# Calls the Rust function `uniffi_trafilatura_uniffi_fn_func_default_options`
# with no arguments and decodes the returned buffer.
#
# @return [ExtractionOptions] value produced by
#   `consumeIntoTypeExtractionOptions` on the returned buffer
def self.default_options
  buffer = Trafilatura.rust_call(:uniffi_trafilatura_uniffi_fn_func_default_options)
  buffer.consumeIntoTypeExtractionOptions
end
|
|
1292
|
+
|
|
1293
|
+
|
|
1294
|
+
|
|
1295
|
+
|
|
1296
|
+
|
|
1297
|
+
# Calls the Rust function `uniffi_trafilatura_uniffi_fn_func_extract`.
#
# `html` is passed through Trafilatura.uniffi_utf8 and `options` is checked
# with RustBuffer.check_lower_TypeExtractionOptions before either argument is
# serialised for the native call.
#
# @param html [String] document to pass to the native function
# @param options [ExtractionOptions] settings to pass to the native function
# @return [ExtractResult] value produced by `consumeIntoTypeExtractResult`
#   on the returned buffer
# NOTE(review): TrafilaturaError is handed to rust_call_with_error as the
# error type, so native failures presumably surface as that exception -
# confirm against rust_call_with_error.
def self.extract(html, options)
  html = Trafilatura.uniffi_utf8(html)
  RustBuffer.check_lower_TypeExtractionOptions(options)

  buffer = Trafilatura.rust_call_with_error(
    TrafilaturaError,
    :uniffi_trafilatura_uniffi_fn_func_extract,
    RustBuffer.allocFromString(html),
    RustBuffer.alloc_from_TypeExtractionOptions(options)
  )
  buffer.consumeIntoTypeExtractResult
end
|
|
1307
|
+
|
|
1308
|
+
|
|
1309
|
+
|
|
1310
|
+
|
|
1311
|
+
|
|
1312
|
+
# Calls the Rust function `uniffi_trafilatura_uniffi_fn_func_extract_simple`.
#
# `html` is passed through Trafilatura.uniffi_utf8 and then serialised with
# RustBuffer.allocFromString; no options record is sent.
#
# @param html [String] document to pass to the native function
# @return [ExtractResult] value produced by `consumeIntoTypeExtractResult`
#   on the returned buffer
# NOTE(review): TrafilaturaError is handed to rust_call_with_error as the
# error type, so native failures presumably surface as that exception -
# confirm against rust_call_with_error.
def self.extract_simple(html)
  html = Trafilatura.uniffi_utf8(html)

  buffer = Trafilatura.rust_call_with_error(
    TrafilaturaError,
    :uniffi_trafilatura_uniffi_fn_func_extract_simple,
    RustBuffer.allocFromString(html)
  )
  buffer.consumeIntoTypeExtractResult
end
|
|
1319
|
+
|
|
1320
|
+
|
|
1321
|
+
|
|
1322
|
+
|
|
1323
|
+
|
|
1324
|
+
end
|
|
1325
|
+
|