trafilatura 0.3.7-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1325 @@
1
+ # This file was autogenerated by some hot garbage in the `uniffi` crate.
2
+ # Trust me, you don't want to mess with it!
3
+
4
+ # Common helper code.
5
+ #
6
+ # Ideally this would live in a separate .rb file where it can be unittested etc
7
+ # in isolation, and perhaps even published as a re-useable package.
8
+ #
9
+ # However, it's important that the details of how this helper code works (e.g. the
10
+ # way that different builtin types are passed across the FFI) exactly match what's
11
+ # expected by the rust code on the other side of the interface. In practice right
12
+ # now that means coming from the exact same version of `uniffi` that was used to
13
+ # compile the rust component. The easiest way to ensure this is to bundle the Ruby
14
+ # helpers directly inline like we're doing here.
15
+
16
+ require 'ffi'
17
+
18
+
19
+ module Trafilatura
# Coerces `i` to an Integer and verifies it lies in the half-open range
# [min, max).
#
# @param i [#to_int] value to check
# @param type_name [String] label used in the RangeError message
# @param min [Integer] inclusive lower bound
# @param max [Integer] exclusive upper bound
# @return [Integer] the result of `i.to_int`
# @raise [TypeError] if `i` does not respond to `to_int`
# @raise [RangeError] if the converted value is outside [min, max)
def self.uniffi_in_range(i, type_name, min, max)
  unless i.respond_to?(:to_int)
    raise TypeError, "no implicit conversion of #{i} into Integer"
  end

  converted = i.to_int
  within_bounds = min <= converted && converted < max
  raise RangeError, "#{type_name} requires #{min} <= value < #{max}" unless within_bounds

  converted
end
26
+
# Converts `v` to a String transcoded to UTF-8 and checks that the result
# is validly encoded.
#
# @param v [#to_str] value to convert
# @return [String] UTF-8 encoded string
# @raise [TypeError] if `v` does not respond to `to_str`
# @raise [Encoding::InvalidByteSequenceError] if the UTF-8 string is invalid
def self.uniffi_utf8(v)
  unless v.respond_to?(:to_str)
    raise TypeError, "no implicit conversion of #{v} into String"
  end

  utf8 = v.to_str.encode(Encoding::UTF_8)
  return utf8 if utf8.valid_encoding?

  raise Encoding::InvalidByteSequenceError, "not a valid UTF-8 encoded string"
end
33
+
# Converts `v` to a String via `to_str` without touching its encoding.
#
# @param v [#to_str] value to convert
# @return [String] the result of `v.to_str`
# @raise [TypeError] if `v` does not respond to `to_str`
def self.uniffi_bytes(v)
  return v.to_str if v.respond_to?(:to_str)

  raise TypeError, "no implicit conversion of #{v} into String"
end
38
+
# FFI struct with three fields (capacity, len, data pointer) that is passed
# to and returned from the native library.
#
# Class-level `alloc_from_*` / `allocFromString` helpers serialise a Ruby
# value into a new buffer through a RustBufferBuilder. Instance-level
# `consumeInto*` helpers decode the buffer through a RustBufferStream and
# free it afterwards. `check_lower_*` helpers walk a value before lowering;
# most of them perform no checks in this binding.
class RustBuffer < FFI::Struct
  layout :capacity, :uint64,
         :len,      :uint64,
         :data,     :pointer

  # Calls the library's rustbuffer_alloc entry point with `size`.
  def self.alloc(size)
    Trafilatura.rust_call(:ffi_trafilatura_uniffi_rustbuffer_alloc, size)
  end

  # Calls the library's rustbuffer_reserve entry point for `rbuf`.
  def self.reserve(rbuf, additional)
    Trafilatura.rust_call(:ffi_trafilatura_uniffi_rustbuffer_reserve, rbuf, additional)
  end

  # Passes this buffer to the library's rustbuffer_free entry point.
  def free
    Trafilatura.rust_call(:ffi_trafilatura_uniffi_rustbuffer_free, self)
  end

  # Struct field readers / writer.
  def capacity
    self[:capacity]
  end

  def len
    self[:len]
  end

  def len=(value)
    self[:len] = value
  end

  def data
    self[:data]
  end

  # Debug representation; reads `len` bytes from the data pointer.
  def to_s
    "RustBuffer(capacity=#{capacity}, len=#{len}, data=#{data.read_bytes len})"
  end

  # Yields a fresh RustBufferBuilder. If the block raises a StandardError the
  # builder's buffer is discarded (freed) before the error is re-raised, so
  # it is not leaked.
  def self.allocWithBuilder
    writer = RustBufferBuilder.new

    begin
      yield writer
    rescue => e
      writer.discard
      raise e
    end
  end

  # Yields a RustBufferStream over this buffer and always frees the buffer
  # afterwards. When the block finishes normally (without a non-local
  # return) any unread bytes are reported as an error.
  def consumeWithStream
    reader = RustBufferStream.new(self)

    yield reader

    raise RuntimeError, 'junk data left in buffer after consuming' unless reader.remaining.zero?
  ensure
    free
  end

  # The primitive String type.

  def self.allocFromString(value)
    allocWithBuilder do |writer|
      writer.write(value.encode('utf-8'))
      return writer.finalize
    end
  end

  def consumeIntoString
    consumeWithStream do |reader|
      return reader.read(reader.remaining).force_encoding(Encoding::UTF_8)
    end
  end

  # The Record type ExtractResult.

  def self.check_lower_TypeExtractResult(v)
    check_lower_TypeMetadata(v.metadata)
  end

  def self.alloc_from_TypeExtractResult(v)
    allocWithBuilder do |writer|
      writer.write_TypeExtractResult(v)
      return writer.finalize
    end
  end

  def consumeIntoTypeExtractResult
    consumeWithStream do |reader|
      return reader.readTypeExtractResult
    end
  end

  # The Record type ExtractionConfig.

  # No field of this record needs a pre-lowering check.
  def self.check_lower_TypeExtractionConfig(v)
    nil
  end

  def self.alloc_from_TypeExtractionConfig(v)
    allocWithBuilder do |writer|
      writer.write_TypeExtractionConfig(v)
      return writer.finalize
    end
  end

  def consumeIntoTypeExtractionConfig
    consumeWithStream do |reader|
      return reader.readTypeExtractionConfig
    end
  end

  # The Record type ExtractionOptions.

  def self.check_lower_TypeExtractionOptions(v)
    check_lower_TypeExtractionConfig(v.config)
    check_lower_Optionalstring(v.original_url)
    check_lower_Optionalstring(v.target_language)
    check_lower_TypeExtractionFocus(v.focus)
    check_lower_Optionali64(v.max_tree_size)
    check_lower_Optionalstring(v.prune_selector)
    check_lower_TypeHtmlDateMode(v.html_date_mode)
    check_lower_Optionalstring(v.html_date_override)
  end

  def self.alloc_from_TypeExtractionOptions(v)
    allocWithBuilder do |writer|
      writer.write_TypeExtractionOptions(v)
      return writer.finalize
    end
  end

  def consumeIntoTypeExtractionOptions
    consumeWithStream do |reader|
      return reader.readTypeExtractionOptions
    end
  end

  # The Record type Metadata.

  def self.check_lower_TypeMetadata(v)
    check_lower_Optionalstring(v.date)
    check_lower_Sequencestring(v.categories)
    check_lower_Sequencestring(v.tags)
  end

  def self.alloc_from_TypeMetadata(v)
    allocWithBuilder do |writer|
      writer.write_TypeMetadata(v)
      return writer.finalize
    end
  end

  def consumeIntoTypeMetadata
    consumeWithStream do |reader|
      return reader.readTypeMetadata
    end
  end

  # The Enum type ExtractionFocus.

  # Enum values get no pre-lowering check.
  def self.check_lower_TypeExtractionFocus(v)
    nil
  end

  def self.alloc_from_TypeExtractionFocus(v)
    allocWithBuilder do |writer|
      writer.write_TypeExtractionFocus(v)
      return writer.finalize
    end
  end

  def consumeIntoTypeExtractionFocus
    consumeWithStream do |reader|
      return reader.readTypeExtractionFocus
    end
  end

  # The Enum type HtmlDateMode.

  # Enum values get no pre-lowering check.
  def self.check_lower_TypeHtmlDateMode(v)
    nil
  end

  def self.alloc_from_TypeHtmlDateMode(v)
    allocWithBuilder do |writer|
      writer.write_TypeHtmlDateMode(v)
      return writer.finalize
    end
  end

  def consumeIntoTypeHtmlDateMode
    consumeWithStream do |reader|
      return reader.readTypeHtmlDateMode
    end
  end

  # The Optional<T> type for i64.

  # Nothing is checked for either a nil or a present value.
  def self.check_lower_Optionali64(v)
    return if v.nil?

    nil
  end

  def self.alloc_from_Optionali64(v)
    allocWithBuilder do |writer|
      writer.write_Optionali64(v)
      return writer.finalize
    end
  end

  def consumeIntoOptionali64
    consumeWithStream do |reader|
      return reader.readOptionali64
    end
  end

  # The Optional<T> type for string.

  # Nothing is checked for either a nil or a present value.
  def self.check_lower_Optionalstring(v)
    return if v.nil?

    nil
  end

  def self.alloc_from_Optionalstring(v)
    allocWithBuilder do |writer|
      writer.write_Optionalstring(v)
      return writer.finalize
    end
  end

  def consumeIntoOptionalstring
    consumeWithStream do |reader|
      return reader.readOptionalstring
    end
  end

  # The Sequence<T> type for string.

  # Iterates the sequence (so `v` must respond to `each`) without checking
  # the individual items; returns whatever `each` returns.
  def self.check_lower_Sequencestring(v)
    v.each { |_item| }
  end

  def self.alloc_from_Sequencestring(v)
    allocWithBuilder do |writer|
      writer.write_Sequencestring(v)
      return writer.finalize
    end
  end

  def consumeIntoSequencestring
    consumeWithStream do |reader|
      return reader.readSequencestring
    end
  end
end
330
+
module UniFFILib
  # FFI struct with a 32-bit length and a data pointer; used as the argument
  # type of the rustbuffer_from_bytes function attached further down.
  class ForeignBytes < FFI::Struct
    layout :len,  :int32,
           :data, :pointer

    # Value of the `len` field.
    def len
      self[:len]
    end

    # Value of the `data` pointer field.
    def data
      self[:data]
    end

    # Debug representation; reads `len` bytes from the data pointer.
    def to_s
      "ForeignBytes(len=#{len}, data=#{data.read_bytes(len)})"
    end
  end
end

# Keep the raw FFI module out of the public namespace.
private_constant :UniFFILib
351
+
# Helper for structured reading of values from a RustBuffer.
#
# Keeps a byte offset into the wrapped buffer; every read advances it.
# Multi-byte integers are decoded big-endian (`l>` / `q>` pack formats).
class RustBufferStream
  # @param rbuf [#len, #data] buffer whose `data` responds to `get_bytes`
  def initialize(rbuf)
    @rbuf = rbuf
    @offset = 0
  end

  # Number of bytes not yet consumed.
  def remaining
    @rbuf.len - @offset
  end

  # Returns the next `size` raw bytes and advances the offset.
  # Raises InternalError when fewer than `size` bytes are left.
  def read(size)
    stop = @offset + size
    raise InternalError, 'read past end of rust buffer' if stop > @rbuf.len

    chunk = @rbuf.data.get_bytes(@offset, size)
    @offset = stop
    chunk
  end

  def readI32
    unpack_from(4, 'l>')
  end

  def readI64
    unpack_from(8, 'q>')
  end

  # A single byte: 0 is false, 1 is true, anything else is an error.
  def readBool
    case unpack_from(1, 'c')
    when 0 then false
    when 1 then true
    else raise InternalError, 'Unexpected byte for Boolean type'
    end
  end

  # A 32-bit byte length followed by that many bytes, tagged as UTF-8.
  def readString
    byte_count = unpack_from(4, 'l>')
    raise InternalError, 'Unexpected negative string length' if byte_count.negative?

    read(byte_count).force_encoding(Encoding::UTF_8)
  end

  # The Record type ExtractResult.

  def readTypeExtractResult
    ExtractResult.new(
      content_text: readString,
      comments_text: readString,
      content_html: readString,
      comments_html: readString,
      metadata: readTypeMetadata
    )
  end

  # The Record type ExtractionConfig.

  def readTypeExtractionConfig
    ExtractionConfig.new(
      min_extracted_size: readI32,
      min_extracted_comment_size: readI32,
      min_output_size: readI32,
      min_output_comment_size: readI32
    )
  end

  # The Record type ExtractionOptions.

  def readTypeExtractionOptions
    ExtractionOptions.new(
      config: readTypeExtractionConfig,
      original_url: readOptionalstring,
      target_language: readOptionalstring,
      enable_fallback: readBool,
      focus: readTypeExtractionFocus,
      exclude_comments: readBool,
      exclude_tables: readBool,
      include_images: readBool,
      include_links: readBool,
      deduplicate: readBool,
      require_essential_metadata: readBool,
      max_tree_size: readOptionali64,
      prune_selector: readOptionalstring,
      html_date_mode: readTypeHtmlDateMode,
      html_date_override: readOptionalstring
    )
  end

  # The Record type Metadata.

  def readTypeMetadata
    Metadata.new(
      title: readString,
      author: readString,
      url: readString,
      hostname: readString,
      description: readString,
      sitename: readString,
      date: readOptionalstring,
      categories: readSequencestring,
      tags: readSequencestring,
      id: readString,
      fingerprint: readString,
      license: readString,
      language: readString,
      image: readString,
      page_type: readString
    )
  end

  # The Enum type ExtractionFocus (32-bit variant tag, 1-based).

  def readTypeExtractionFocus
    case unpack_from(4, 'l>')
    when 1 then ExtractionFocus::BALANCED
    when 2 then ExtractionFocus::FAVOR_RECALL
    when 3 then ExtractionFocus::FAVOR_PRECISION
    else raise InternalError, 'Unexpected variant tag for TypeExtractionFocus'
    end
  end

  # The Enum type HtmlDateMode (32-bit variant tag, 1-based).

  def readTypeHtmlDateMode
    case unpack_from(4, 'l>')
    when 1 then HtmlDateMode::AUTOMATIC
    when 2 then HtmlDateMode::FAST
    when 3 then HtmlDateMode::EXTENSIVE
    when 4 then HtmlDateMode::DISABLED
    else raise InternalError, 'Unexpected variant tag for TypeHtmlDateMode'
    end
  end

  # The Error type TrafilaturaError: a 32-bit variant tag followed by the
  # variant's fields. Returns the exception instance (does not raise it).

  def readTypeTrafilaturaError
    case unpack_from(4, 'l>')
    when 1 then TrafilaturaError::ParseError.new(readString)
    when 2 then TrafilaturaError::LanguageMismatch.new(readString, readString)
    when 3 then TrafilaturaError::InsufficientContent.new(readString)
    when 4 then TrafilaturaError::MissingMetadata.new(readString)
    when 5 then TrafilaturaError::DuplicateContent.new
    when 6 then TrafilaturaError::TreeTooLarge.new(readI64)
    else raise InternalError, 'Unexpected variant tag for TypeTrafilaturaError'
    end
  end

  # The Optional<T> type for i64: a flag byte, then the value when flag is 1.

  def readOptionali64
    case unpack_from(1, 'c')
    when 0 then nil
    when 1 then readI64
    else raise InternalError, 'Unexpected flag byte for Optionali64'
    end
  end

  # The Optional<T> type for string: a flag byte, then the value when flag is 1.

  def readOptionalstring
    case unpack_from(1, 'c')
    when 0 then nil
    when 1 then readString
    else raise InternalError, 'Unexpected flag byte for Optionalstring'
    end
  end

  # The Sequence<T> type for string: a 32-bit count followed by the items.

  def readSequencestring
    count = unpack_from(4, 'l>')
    raise InternalError, 'Unexpected negative sequence length' if count.negative?

    strings = []
    count.times { strings << readString }
    strings
  end

  # Reads `size` bytes (same bounds check and offset advance as `read`) and
  # decodes them with the given `String#unpack` format, returning the first
  # decoded element.
  def unpack_from(size, format)
    decoded = read(size).unpack(format)

    # TODO: verify this
    raise 'more than one element!!!' if decoded.size > 1

    decoded[0]
  end
end
615
+
616
+ private_constant :RustBufferStream
617
+
# Helper for structured writing of values into a RustBuffer.
#
# Owns a RustBuffer until `finalize` hands it out or `discard` frees it.
# Multi-byte integers are encoded big-endian (`l>` / `q>` pack formats).
class RustBufferBuilder
  # Starts with a buffer obtained from `RustBuffer.alloc 16` whose length is
  # reset to zero.
  def initialize
    @rust_buf = RustBuffer.alloc(16)
    @rust_buf.len = 0
  end

  # Releases ownership: returns the buffer and forgets it.
  def finalize
    finished, @rust_buf = @rust_buf, nil
    finished
  end

  # Frees the buffer unless it was already handed out.
  def discard
    finalize.free unless @rust_buf.nil?
  end

  # Appends the raw bytes of `value`.
  def write(value)
    payload = value.bytes
    reserve(payload.size) do
      @rust_buf.data.put_array_of_char(@rust_buf.len, payload)
    end
  end

  def write_I32(v)
    checked = Trafilatura::uniffi_in_range(v, "i32", -2**31, 2**31)
    pack_into(4, 'l>', checked)
  end

  def write_I64(v)
    checked = Trafilatura::uniffi_in_range(v, "i64", -2**63, 2**63)
    pack_into(8, 'q>', checked)
  end

  # Any truthy value is written as 1, nil/false as 0.
  def write_Bool(v)
    flag = v ? 1 : 0
    pack_into(1, 'c', flag)
  end

  # 32-bit byte length followed by the UTF-8 bytes.
  def write_String(v)
    utf8 = Trafilatura::uniffi_utf8(v)
    pack_into(4, 'l>', utf8.bytesize)
    write(utf8)
  end

  # The Record type ExtractResult.

  def write_TypeExtractResult(v)
    write_String(v.content_text)
    write_String(v.comments_text)
    write_String(v.content_html)
    write_String(v.comments_html)
    write_TypeMetadata(v.metadata)
  end

  # The Record type ExtractionConfig.

  def write_TypeExtractionConfig(v)
    write_I32(v.min_extracted_size)
    write_I32(v.min_extracted_comment_size)
    write_I32(v.min_output_size)
    write_I32(v.min_output_comment_size)
  end

  # The Record type ExtractionOptions.

  def write_TypeExtractionOptions(v)
    write_TypeExtractionConfig(v.config)
    write_Optionalstring(v.original_url)
    write_Optionalstring(v.target_language)
    write_Bool(v.enable_fallback)
    write_TypeExtractionFocus(v.focus)
    write_Bool(v.exclude_comments)
    write_Bool(v.exclude_tables)
    write_Bool(v.include_images)
    write_Bool(v.include_links)
    write_Bool(v.deduplicate)
    write_Bool(v.require_essential_metadata)
    write_Optionali64(v.max_tree_size)
    write_Optionalstring(v.prune_selector)
    write_TypeHtmlDateMode(v.html_date_mode)
    write_Optionalstring(v.html_date_override)
  end

  # The Record type Metadata.

  def write_TypeMetadata(v)
    write_String(v.title)
    write_String(v.author)
    write_String(v.url)
    write_String(v.hostname)
    write_String(v.description)
    write_String(v.sitename)
    write_Optionalstring(v.date)
    write_Sequencestring(v.categories)
    write_Sequencestring(v.tags)
    write_String(v.id)
    write_String(v.fingerprint)
    write_String(v.license)
    write_String(v.language)
    write_String(v.image)
    write_String(v.page_type)
  end

  # The Enum type ExtractionFocus: the value is written as-is as a 32-bit tag.

  def write_TypeExtractionFocus(v)
    pack_into(4, 'l>', v)
  end

  # The Enum type HtmlDateMode: the value is written as-is as a 32-bit tag.

  def write_TypeHtmlDateMode(v)
    pack_into(4, 'l>', v)
  end

  # The Optional<T> type for i64: flag byte 0 for nil, else 1 plus the value.

  def write_Optionali64(v)
    return pack_into(1, 'c', 0) if v.nil?

    pack_into(1, 'c', 1)
    write_I64(v)
  end

  # The Optional<T> type for string: flag byte 0 for nil, else 1 plus the value.

  def write_Optionalstring(v)
    return pack_into(1, 'c', 0) if v.nil?

    pack_into(1, 'c', 1)
    write_String(v)
  end

  # The Sequence<T> type for string: 32-bit item count followed by the items.

  def write_Sequencestring(items)
    pack_into(4, 'l>', items.size)

    items.each { |entry| write_String(entry) }
  end

  private

  # Grows the buffer through `RustBuffer.reserve` when `num_bytes` more bytes
  # would exceed its capacity, yields so the caller can write at the current
  # length, then advances the length by `num_bytes`.
  def reserve(num_bytes)
    required = @rust_buf.len + num_bytes
    @rust_buf = RustBuffer.reserve(@rust_buf, num_bytes) if required > @rust_buf.capacity

    yield

    @rust_buf.len = @rust_buf.len + num_bytes
  end

  # Packs a single value with the given `Array#pack` format and appends it.
  def pack_into(size, format, value)
    encoded = [value].pack(format).bytes
    reserve(size) do
      @rust_buf.data.put_array_of_char(@rust_buf.len, encoded)
    end
  end
end
793
+
794
+ private_constant :RustBufferBuilder
795
+
# Error definitions

# FFI struct passed by reference as the last argument of every native call
# (see the attach_function list): an 8-bit status code plus a RustBuffer
# for the error payload.
class RustCallStatus < FFI::Struct
  layout :code,      :int8,
         :error_buf, RustBuffer

  # Status code written by the native call.
  def code
    self[:code]
  end

  # Buffer holding the error payload.
  def error_buf
    self[:error_buf]
  end

  def to_s
    "RustCallStatus(code=#{self[:code]})"
  end
end

# These match the values from the uniffi::rustcalls module
CALL_SUCCESS = 0
CALL_ERROR   = 1
CALL_PANIC   = 2
818
+
819
+
820
+
821
+
# Namespace for the error variants that RustBufferStream#readTypeTrafilaturaError
# can decode. Each variant is a StandardError carrying its fields as readers
# and rendering itself as "ClassName(field=value, ...)".
module TrafilaturaError
  class ParseError < StandardError
    attr_reader :reason

    def initialize(reason)
      @reason = reason
      super()
    end

    def to_s
      format('%s(reason=%s)', self.class.name, @reason.inspect)
    end
  end

  class LanguageMismatch < StandardError
    attr_reader :expected, :got

    def initialize(expected, got)
      @expected = expected
      @got = got
      super()
    end

    def to_s
      format('%s(expected=%s, got=%s)', self.class.name, @expected.inspect, @got.inspect)
    end
  end

  class InsufficientContent < StandardError
    attr_reader :reason

    def initialize(reason)
      @reason = reason
      super()
    end

    def to_s
      format('%s(reason=%s)', self.class.name, @reason.inspect)
    end
  end

  class MissingMetadata < StandardError
    attr_reader :reason

    def initialize(reason)
      @reason = reason
      super()
    end

    def to_s
      format('%s(reason=%s)', self.class.name, @reason.inspect)
    end
  end

  # Variant without fields.
  class DuplicateContent < StandardError
    def initialize
      super()
    end

    def to_s
      format('%s()', self.class.name)
    end
  end

  class TreeTooLarge < StandardError
    attr_reader :size

    def initialize(size)
      @size = size
      super()
    end

    def to_s
      format('%s(size=%s)', self.class.name, @size.inspect)
    end
  end
end
900
+
901
+
# Map error modules to the name of the RustBufferStream method that decodes
# them (the method is invoked on the stream in consume_buffer_into_error).
# Frozen because it is a fixed lookup table that must not be mutated at runtime.
ERROR_MODULE_TO_READER_METHOD = {
  TrafilaturaError => :readTypeTrafilaturaError
}.freeze

private_constant :ERROR_MODULE_TO_READER_METHOD, :CALL_SUCCESS, :CALL_ERROR, :CALL_PANIC,
                 :RustCallStatus
913
+
# Decodes `rust_buffer` into an error object using the reader method
# registered for `error_module`; the buffer is freed by consumeWithStream.
#
# @param error_module [Module] key into ERROR_MODULE_TO_READER_METHOD
# @param rust_buffer [RustBuffer] buffer holding the serialised error
# @return [Object] whatever the reader method returns
def self.consume_buffer_into_error(error_module, rust_buffer)
  reader_name = ERROR_MODULE_TO_READER_METHOD[error_module]

  rust_buffer.consumeWithStream do |reader|
    return reader.send(reader_name)
  end
end
920
+
# Raised by the binding itself for malformed buffers, unexpected call
# statuses and Rust panics.
class InternalError < StandardError; end
923
+
# Call a rust function that has no declared error type.
#
# Delegates to rust_call_with_error with a nil error module.
def self.rust_call(fn_name, *args)
  no_error_module = nil
  rust_call_with_error(no_error_module, fn_name, *args)
end
928
+
# Call a rust function and handle errors.
#
# Use this when the rust function returns a Result<>. error_module must be
# the error module that corresponds to that Result.
#
# @param error_module [Module, nil] key used to pick the error reader
# @param fn_name [Symbol] name of the attached UniFFILib function
# @param args [Array] arguments forwarded ahead of the status struct
# @return [Object] the native function's return value on success
# @raise [InternalError] on a panic, an unknown status code, or an error
#   status when no error_module was given
# @raise [Object] the decoded error when the status is CALL_ERROR
def self.rust_call_with_error(error_module, fn_name, *args)
  # Note: RustCallStatus.new zeroes out the struct, which is exactly what we
  # want to pass to Rust (code=0, error_buf=RustBuffer(len=0, capacity=0,
  # data=NULL))
  status = RustCallStatus.new

  # The status struct is always the trailing argument.
  result = UniFFILib.public_send(fn_name, *args, status)

  code = status.code
  return result if code == CALL_SUCCESS

  if code == CALL_ERROR
    raise consume_buffer_into_error(error_module, status.error_buf) unless error_module.nil?

    status.error_buf.free
    raise InternalError, "CALL_ERROR with no error_module set"
  elsif code == CALL_PANIC
    # When the rust code sees a panic, it tries to construct a RustBuffer
    # with the message. But if that code panics, then it just sends back
    # an empty buffer.
    raise InternalError, status.error_buf.consumeIntoString() if status.error_buf.len > 0

    raise InternalError, "Rust panic"
  else
    raise InternalError, "Unknown call status: #{status.code}"
  end
end
966
+
967
+ private_class_method :consume_buffer_into_error
968
+
# This is how we find and load the dynamic library provided by the component.
# For now we just look it up by name.
module UniFFILib
  extend FFI::Library

  # The shared library is expected next to this file.
  ffi_lib File.join(__dir__, 'libtrafilatura_uniffi.so')

  # Each entry is [function name, argument types, return type]; the entries
  # are attached in the order listed.
  [
    [:uniffi_trafilatura_uniffi_fn_func_create_readable_document,
     [RustBuffer.by_value, RustCallStatus.by_ref],
     RustBuffer.by_value],
    [:uniffi_trafilatura_uniffi_fn_func_default_config,
     [RustCallStatus.by_ref],
     RustBuffer.by_value],
    [:uniffi_trafilatura_uniffi_fn_func_default_options,
     [RustCallStatus.by_ref],
     RustBuffer.by_value],
    [:uniffi_trafilatura_uniffi_fn_func_extract,
     [RustBuffer.by_value, RustBuffer.by_value, RustCallStatus.by_ref],
     RustBuffer.by_value],
    [:uniffi_trafilatura_uniffi_fn_func_extract_simple,
     [RustBuffer.by_value, RustCallStatus.by_ref],
     RustBuffer.by_value],
    [:ffi_trafilatura_uniffi_rustbuffer_alloc,
     [:uint64, RustCallStatus.by_ref],
     RustBuffer.by_value],
    [:ffi_trafilatura_uniffi_rustbuffer_from_bytes,
     [ForeignBytes, RustCallStatus.by_ref],
     RustBuffer.by_value],
    [:ffi_trafilatura_uniffi_rustbuffer_free,
     [RustBuffer.by_value, RustCallStatus.by_ref],
     :void],
    [:ffi_trafilatura_uniffi_rustbuffer_reserve,
     [RustBuffer.by_value, :uint64, RustCallStatus.by_ref],
     RustBuffer.by_value],
    [:uniffi_trafilatura_uniffi_checksum_func_create_readable_document,
     [RustCallStatus.by_ref],
     :uint16],
    [:uniffi_trafilatura_uniffi_checksum_func_default_config,
     [RustCallStatus.by_ref],
     :uint16],
    [:uniffi_trafilatura_uniffi_checksum_func_default_options,
     [RustCallStatus.by_ref],
     :uint16],
    [:uniffi_trafilatura_uniffi_checksum_func_extract,
     [RustCallStatus.by_ref],
     :uint16],
    [:uniffi_trafilatura_uniffi_checksum_func_extract_simple,
     [RustCallStatus.by_ref],
     :uint16],
    [:ffi_trafilatura_uniffi_uniffi_contract_version,
     [RustCallStatus.by_ref],
     :uint32]
  ].each do |function_name, argument_types, return_type|
    attach_function function_name, argument_types, return_type
  end
end
1025
+
1026
+ # Public interface members begin here.
1027
+
1028
+
1029
+
1030
+
1031
+
# Enum type ExtractionFocus. The integer values are the variant tags that
# are written to and read from the serialised buffers.
class ExtractionFocus
  BALANCED        = 1
  FAVOR_RECALL    = 2
  FAVOR_PRECISION = 3
end
1038
+
1039
+
1040
+
1041
+
1042
+
1043
+
# Enum type HtmlDateMode. The integer values are the variant tags that are
# written to and read from the serialised buffers.
class HtmlDateMode
  AUTOMATIC = 1
  FAST      = 2
  EXTENSIVE = 3
  DISABLED  = 4
end
1051
+
1052
+
1053
+
1054
+
# Record type ExtractResult
#
# Read-only value object with four text/HTML fields and a metadata field.
class ExtractResult
  attr_reader :content_text, :comments_text, :content_html, :comments_html, :metadata

  def initialize(content_text:, comments_text:, content_html:, comments_html:, metadata:)
    @content_text = content_text
    @comments_text = comments_text
    @content_html = content_html
    @comments_html = comments_html
    @metadata = metadata
  end

  # Field-by-field equality.
  #
  # Returns false (instead of raising NoMethodError) when `other` is nil or
  # any object that does not expose all of this record's readers.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    %i[content_text comments_text content_html comments_html metadata].all? do |field|
      other.respond_to?(field) && public_send(field) == other.public_send(field)
    end
  end
end
1087
+
# Record type ExtractionConfig
#
# Read-only value object holding four size thresholds (serialised as i32).
class ExtractionConfig
  attr_reader :min_extracted_size, :min_extracted_comment_size, :min_output_size, :min_output_comment_size

  def initialize(min_extracted_size:, min_extracted_comment_size:, min_output_size:, min_output_comment_size:)
    @min_extracted_size = min_extracted_size
    @min_extracted_comment_size = min_extracted_comment_size
    @min_output_size = min_output_size
    @min_output_comment_size = min_output_comment_size
  end

  # Field-by-field equality.
  #
  # Returns false (instead of raising NoMethodError) when `other` is nil or
  # any object that does not expose all of this record's readers.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    %i[
      min_extracted_size min_extracted_comment_size min_output_size min_output_comment_size
    ].all? do |field|
      other.respond_to?(field) && public_send(field) == other.public_send(field)
    end
  end
end
1116
+
# Record type ExtractionOptions
#
# Read-only value object bundling an ExtractionConfig with the remaining
# extraction switches; all fields are required keyword arguments.
class ExtractionOptions
  attr_reader :config, :original_url, :target_language, :enable_fallback, :focus, :exclude_comments, :exclude_tables, :include_images, :include_links, :deduplicate, :require_essential_metadata, :max_tree_size, :prune_selector, :html_date_mode, :html_date_override

  def initialize(config:, original_url:, target_language:, enable_fallback:, focus:, exclude_comments:, exclude_tables:, include_images:, include_links:, deduplicate:, require_essential_metadata:, max_tree_size:, prune_selector:, html_date_mode:, html_date_override:)
    @config = config
    @original_url = original_url
    @target_language = target_language
    @enable_fallback = enable_fallback
    @focus = focus
    @exclude_comments = exclude_comments
    @exclude_tables = exclude_tables
    @include_images = include_images
    @include_links = include_links
    @deduplicate = deduplicate
    @require_essential_metadata = require_essential_metadata
    @max_tree_size = max_tree_size
    @prune_selector = prune_selector
    @html_date_mode = html_date_mode
    @html_date_override = html_date_override
  end

  # Field-by-field equality.
  #
  # Returns false (instead of raising NoMethodError) when `other` is nil or
  # any object that does not expose all of this record's readers.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    %i[
      config original_url target_language enable_fallback focus
      exclude_comments exclude_tables include_images include_links deduplicate
      require_essential_metadata max_tree_size prune_selector
      html_date_mode html_date_override
    ].all? do |field|
      other.respond_to?(field) && public_send(field) == other.public_send(field)
    end
  end
end
1189
+
# Record type Metadata
#
# Read-only value object with the document metadata fields; all fields are
# required keyword arguments.
class Metadata
  attr_reader :title, :author, :url, :hostname, :description, :sitename, :date, :categories, :tags, :id, :fingerprint, :license, :language, :image, :page_type

  def initialize(title:, author:, url:, hostname:, description:, sitename:, date:, categories:, tags:, id:, fingerprint:, license:, language:, image:, page_type:)
    @title = title
    @author = author
    @url = url
    @hostname = hostname
    @description = description
    @sitename = sitename
    @date = date
    @categories = categories
    @tags = tags
    @id = id
    @fingerprint = fingerprint
    @license = license
    @language = language
    @image = image
    @page_type = page_type
  end

  # Field-by-field equality.
  #
  # Returns false (instead of raising NoMethodError) when `other` is nil or
  # any object that does not expose all of this record's readers.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    %i[
      title author url hostname description sitename date categories tags
      id fingerprint license language image page_type
    ].all? do |field|
      other.respond_to?(field) && public_send(field) == other.public_send(field)
    end
  end
end
1262
+
1263
+
1264
+
1265
+
1266
+
# Serialises `result` (an ExtractResult-shaped record), passes it to the
# native create_readable_document function and decodes the returned buffer
# as a UTF-8 String.
#
# @param result [ExtractResult]
# @return [String]
def self.create_readable_document(result)
  RustBuffer.check_lower_TypeExtractResult(result)

  lowered = RustBuffer.alloc_from_TypeExtractResult(result)
  reply = Trafilatura.rust_call(:uniffi_trafilatura_uniffi_fn_func_create_readable_document, lowered)
  reply.consumeIntoString
end
1274
+
1275
+
1276
+
1277
+
1278
+
# Calls the native default_config function and decodes the returned buffer
# into an ExtractionConfig.
#
# @return [ExtractionConfig]
def self.default_config
  reply = Trafilatura.rust_call(:uniffi_trafilatura_uniffi_fn_func_default_config)
  reply.consumeIntoTypeExtractionConfig
end
1283
+
1284
+
1285
+
1286
+
1287
+
# Calls the native default_options function and decodes the returned buffer
# into an ExtractionOptions.
#
# @return [ExtractionOptions]
def self.default_options
  reply = Trafilatura.rust_call(:uniffi_trafilatura_uniffi_fn_func_default_options)
  reply.consumeIntoTypeExtractionOptions
end
1292
+
1293
+
1294
+
1295
+
1296
+
# Passes `html` and `options` to the native extract function and decodes
# the returned buffer into an ExtractResult.
#
# @param html [#to_str] document source; converted to valid UTF-8
# @param options [ExtractionOptions]
# @return [ExtractResult]
# @raise [TypeError, Encoding::InvalidByteSequenceError] if `html` cannot be
#   converted to a valid UTF-8 string
# @raise [StandardError] one of the TrafilaturaError variants decoded from
#   the native error buffer
# @raise [InternalError] on a panic or unknown call status
def self.extract(html, options)
  html = Trafilatura::uniffi_utf8(html)

  RustBuffer.check_lower_TypeExtractionOptions(options)

  html_buffer = RustBuffer.allocFromString(html)
  begin
    options_buffer = RustBuffer.alloc_from_TypeExtractionOptions(options)
  rescue StandardError
    # Serialising the options failed (e.g. a field of the wrong type or out
    # of range). The html buffer was never handed to the native call, so it
    # must be released here or it would leak.
    html_buffer.free
    raise
  end

  result = Trafilatura.rust_call_with_error(TrafilaturaError, :uniffi_trafilatura_uniffi_fn_func_extract, html_buffer, options_buffer)
  result.consumeIntoTypeExtractResult
end
1307
+
1308
+
1309
+
1310
+
1311
+
# Passes `html` to the native extract_simple function and decodes the
# returned buffer into an ExtractResult.
#
# @param html [#to_str] document source; converted to valid UTF-8
# @return [ExtractResult]
# @raise [StandardError] one of the TrafilaturaError variants decoded from
#   the native error buffer
def self.extract_simple(html)
  utf8_html = Trafilatura::uniffi_utf8(html)
  lowered = RustBuffer.allocFromString(utf8_html)

  reply = Trafilatura.rust_call_with_error(TrafilaturaError, :uniffi_trafilatura_uniffi_fn_func_extract_simple, lowered)
  reply.consumeIntoTypeExtractResult
end
1319
+
1320
+
1321
+
1322
+
1323
+
1324
+ end
1325
+