kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,959 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'minitest/autorun'
4
+ require 'kreuzberg'
5
+ require 'json'
6
+ require 'tempfile'
7
+
8
+ # Comprehensive tests for Kreuzberg metadata types
9
+ # Tests verify T::Struct behavior, type safety, and integration with extraction
10
+ # rubocop:disable Metrics/ClassLength, Metrics/MethodLength, Metrics/AbcSize
11
+ class MetadataTypesTest < Minitest::Test
12
# Builds an HtmlMetadata with every field supplied and verifies that the scalar
# readers listed below return exactly the values handed to the constructor.
def test_html_metadata_structure
  attrs = {
    title: 'Test Page',
    description: 'A test description',
    author: 'Test Author',
    copyright: '2024 Test Corp',
    keywords: %w[test metadata],
    canonical_url: 'https://example.com/test',
    language: 'en',
    text_direction: 'ltr',
    mime_type: 'text/html',
    charset: 'utf-8',
    generator: 'Kreuzberg',
    viewport: 'width=device-width, initial-scale=1',
    theme_color: '#ffffff',
    application_name: 'Test App',
    robots: 'index, follow',
    open_graph: { 'og:title' => 'Test', 'og:image' => 'image.jpg' },
    twitter_card: { 'twitter:card' => 'summary' },
    meta_tags: { 'custom' => 'value' },
    headers: [],
    links: [],
    images: [],
    structured_data: []
  }
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  # viewport, keywords and the collection fields are supplied but not asserted in this test.
  checked = %i[title description author copyright canonical_url language text_direction
               mime_type charset generator theme_color application_name robots]
  checked.each do |reader|
    assert_equal attrs[reader], metadata.public_send(reader)
  end
end
52
+
53
# Verifies that HtmlMetadata#keywords hands back the Array of Strings it was built with.
def test_keywords_is_array
  expected = %w[test metadata array]
  unset = %i[title description author copyright canonical_url language text_direction
             mime_type charset generator viewport theme_color application_name robots]
  attrs = unset.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(keywords: expected, open_graph: {}, twitter_card: {}, meta_tags: {},
               headers: [], links: [], images: [], structured_data: [])
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  assert_instance_of Array, metadata.keywords
  assert_equal expected, metadata.keywords
  metadata.keywords.each do |entry|
    assert_instance_of String, entry
  end
end
84
+
85
# Verifies that the canonical URL is exposed through the #canonical_url reader.
def test_canonical_url_renamed
  unset = %i[title description author copyright language text_direction
             mime_type charset generator viewport theme_color application_name robots]
  attrs = unset.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(canonical_url: 'https://example.com/canonical',
               keywords: [], open_graph: {}, twitter_card: {}, meta_tags: {},
               headers: [], links: [], images: [], structured_data: [])
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  assert_equal 'https://example.com/canonical', metadata.canonical_url
  assert_respond_to metadata, :canonical_url
end
114
+
115
# Verifies that HtmlMetadata#open_graph returns the String => String Hash it was built with.
def test_open_graph_is_hash
  og_tags = {
    'og:title' => 'Test Title',
    'og:description' => 'Test Description',
    'og:image' => 'https://example.com/image.jpg',
    'og:url' => 'https://example.com'
  }
  unset = %i[title description author copyright canonical_url language text_direction
             mime_type charset generator viewport theme_color application_name robots]
  attrs = unset.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(keywords: [], open_graph: og_tags, twitter_card: {}, meta_tags: {},
               headers: [], links: [], images: [], structured_data: [])
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  assert_instance_of Hash, metadata.open_graph
  assert_equal og_tags, metadata.open_graph
  metadata.open_graph.each do |property, content|
    assert_instance_of String, property
    assert_instance_of String, content
  end
end
154
+
155
# Verifies that HtmlMetadata#twitter_card returns the String => String Hash it was built with.
def test_twitter_card_is_hash
  twitter_tags = {
    'twitter:card' => 'summary_large_image',
    'twitter:title' => 'Test',
    'twitter:description' => 'Description',
    'twitter:image' => 'https://example.com/image.jpg'
  }
  unset = %i[title description author copyright canonical_url language text_direction
             mime_type charset generator viewport theme_color application_name robots]
  attrs = unset.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(keywords: [], open_graph: {}, twitter_card: twitter_tags, meta_tags: {},
               headers: [], links: [], images: [], structured_data: [])
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  assert_instance_of Hash, metadata.twitter_card
  assert_equal twitter_tags, metadata.twitter_card
  metadata.twitter_card.each do |name, content|
    assert_instance_of String, name
    assert_instance_of String, content
  end
end
194
+
195
+ # ============================================================================
196
+ # T::Struct Behavior Tests
197
+ # ============================================================================
198
+
199
# Checks that HeaderMetadata exposes each constructor argument through its reader.
def test_header_metadata_creation
  attrs = { level: 1, text: 'Main Title', id: 'main-title', depth: 0, html_offset: 245 }
  header = Kreuzberg::HeaderMetadata.new(**attrs)

  # Hash iteration follows insertion order: level, text, id, depth, html_offset.
  attrs.each do |reader, expected|
    assert_equal expected, header.public_send(reader)
  end
end
214
+
215
# Checks that a HeaderMetadata built with a nil id reports nil for #id while
# the remaining readers return their constructor values.
def test_header_metadata_nil_id
  attrs = { level: 2, text: 'Subtitle', id: nil, depth: 1, html_offset: 456 }
  header = Kreuzberg::HeaderMetadata.new(**attrs)

  attrs.each do |reader, expected|
    actual = header.public_send(reader)
    if expected.nil?
      assert_nil actual
    else
      assert_equal expected, actual
    end
  end
end
230
+
231
# Checks the readers of a fully populated LinkMetadata, including the types of
# its rel list and attributes map.
def test_link_metadata_creation
  link = Kreuzberg::LinkMetadata.new(
    href: 'https://example.com', text: 'Example', title: 'Example Site',
    link_type: 'external', rel: %w[noopener noreferrer],
    attributes: { 'data-id' => '123', 'class' => 'external-link' }
  )

  { href: 'https://example.com', text: 'Example',
    title: 'Example Site', link_type: 'external' }.each do |reader, expected|
    assert_equal expected, link.public_send(reader)
  end

  rel = link.rel
  assert_instance_of Array, rel
  assert_equal %w[noopener noreferrer], rel

  attributes = link.attributes
  assert_instance_of Hash, attributes
  assert_equal '123', attributes['data-id']
  assert_equal 'external-link', attributes['class']
end
251
+
252
# Checks that a LinkMetadata built with empty rel/attributes and a nil title
# reports them as empty collections and nil respectively.
def test_link_metadata_empty_arrays_and_hashes
  attrs = { href: 'https://example.com', text: 'Link', title: nil,
            link_type: 'internal', rel: [], attributes: {} }
  link = Kreuzberg::LinkMetadata.new(**attrs)

  assert_equal 'https://example.com', link.href
  %i[rel attributes].each { |reader| assert_empty link.public_send(reader) }
  assert_nil link.title
end
267
+
268
# Checks the readers of an ImageMetadata built with dimensions and attributes.
def test_image_metadata_creation
  attrs = { src: 'images/logo.png', alt: 'Company Logo', title: nil,
            dimensions: [200, 100], image_type: 'png',
            attributes: { 'loading' => 'lazy', 'class' => 'logo' } }
  image = Kreuzberg::ImageMetadata.new(**attrs)

  assert_equal 'images/logo.png', image.src
  assert_equal 'Company Logo', image.alt
  assert_nil image.title

  dimensions = image.dimensions
  assert_instance_of Array, dimensions
  assert_equal [200, 100], dimensions

  assert_equal 'png', image.image_type

  html_attributes = image.attributes
  assert_instance_of Hash, html_attributes
  assert_equal 'lazy', html_attributes['loading']
end
287
+
288
# Checks that an ImageMetadata built without dimensions reports nil for #dimensions.
def test_image_metadata_nil_dimensions
  attrs = { src: 'image.jpg', alt: 'Description', title: 'Title',
            dimensions: nil, image_type: 'jpg', attributes: {} }
  image = Kreuzberg::ImageMetadata.new(**attrs)

  assert_equal 'image.jpg', image.src
  assert_nil image.dimensions
  assert_equal 'jpg', image.image_type
end
302
+
303
# Checks the readers of a StructuredData entry and that its raw_json still
# parses to the original JSON-LD document.
def test_structured_data_creation
  payload = '{"@context":"https://schema.org","@type":"Article","headline":"Test Article"}'
  structured = Kreuzberg::StructuredData.new(data_type: 'json-ld', raw_json: payload, schema_type: 'Article')

  assert_equal 'json-ld', structured.data_type
  assert_equal payload, structured.raw_json
  assert_equal 'Article', structured.schema_type

  document = JSON.parse(structured.raw_json)
  assert_equal 'Article', document['@type']
end
317
+
318
# Checks that a StructuredData entry built without a schema type reports nil for #schema_type.
def test_structured_data_nil_schema_type
  attrs = { data_type: 'microdata', raw_json: '{"data":"value"}', schema_type: nil }
  structured = Kreuzberg::StructuredData.new(**attrs)

  assert_equal 'microdata', structured.data_type
  assert_nil structured.schema_type
end
329
+
330
+ # ============================================================================
331
+ # Integration Tests
332
+ # ============================================================================
333
+
334
# Extracts a minimal HTML document from a temporary file and checks that the
# result is a Kreuzberg::Result carrying non-nil metadata.
#
# Uses Minitest's refute_nil: assert_not_nil comes from Test::Unit / ActiveSupport
# and is not defined on Minitest::Test, so calling it raises NoMethodError
# instead of performing the check.
def test_extract_html_returns_metadata
  html_file = create_test_html_file(
    '<html><head><title>Test Page</title></head><body><p>Content</p></body></html>'
  )

  begin
    result = Kreuzberg.extract_file_sync(html_file)
    assert_instance_of Kreuzberg::Result, result
    refute_nil result.metadata

    # NOTE(review): each branch only re-asserts its own condition, so this block
    # cannot fail for any metadata type; tighten once the return type is confirmed.
    if result.metadata.is_a?(Hash)
      assert_kind_of Hash, result.metadata
    elsif result.metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_kind_of Kreuzberg::HtmlMetadata, result.metadata
    end
  ensure
    # Always remove the temporary file, even when an assertion above fails.
    FileUtils.rm_f(html_file)
  end
end
353
+
354
# Extracts an HTML page declaring a keywords meta tag and, when the returned
# metadata exposes keywords, checks that they are an Array.
def test_metadata_keywords_array
  markup = <<~HTML
    <html>
    <head>
    <title>Test</title>
    <meta name="keywords" content="ruby, testing, metadata">
    </head>
    <body></body>
    </html>
  HTML
  html_file = create_test_html_file(markup)

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata
    keywords = metadata['keywords'] if metadata.is_a?(Hash)

    if keywords
      assert keywords.is_a?(Array)
    elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_instance_of Array, metadata.keywords
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
379
+
380
# Extracts an HTML page carrying Open Graph meta tags and, when the returned
# metadata exposes open_graph, checks that it is a Hash.
def test_metadata_open_graph_hash
  markup = <<~HTML
    <html>
    <head>
    <title>Test</title>
    <meta property="og:title" content="Test Title">
    <meta property="og:description" content="Test Description">
    <meta property="og:image" content="https://example.com/image.jpg">
    </head>
    <body></body>
    </html>
  HTML
  html_file = create_test_html_file(markup)

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata
    open_graph = metadata['open_graph'] if metadata.is_a?(Hash)

    if open_graph
      assert open_graph.is_a?(Hash)
    elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_instance_of Hash, metadata.open_graph
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
407
+
408
# Extracts an HTML page containing h1-h3 headings and, when the returned
# metadata exposes headers, checks that they are an Array.
def test_metadata_headers_array
  markup = <<~HTML
    <html>
    <head><title>Test</title></head>
    <body>
    <h1>Main Title</h1>
    <h2>Subtitle</h2>
    <h3 id="section-1">Section 1</h3>
    </body>
    </html>
  HTML
  html_file = create_test_html_file(markup)

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata
    headers = metadata['headers'] if metadata.is_a?(Hash)

    if headers
      assert headers.is_a?(Array)
    elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_instance_of Array, metadata.headers
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
434
+
435
# Extracts an HTML page containing external, internal and anchor links and,
# when the returned metadata exposes links, checks that they are an Array.
def test_metadata_links_array
  markup = <<~HTML
    <html>
    <head><title>Test</title></head>
    <body>
    <a href="https://example.com">External Link</a>
    <a href="/page">Internal Link</a>
    <a href="#section">Anchor Link</a>
    </body>
    </html>
  HTML
  html_file = create_test_html_file(markup)

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata
    links = metadata['links'] if metadata.is_a?(Hash)

    if links
      assert links.is_a?(Array)
    elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_instance_of Array, metadata.links
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
461
+
462
# Extracts an HTML page containing three img tags and, when the returned
# metadata exposes images, checks that they are an Array.
def test_metadata_images_array
  markup = <<~HTML
    <html>
    <head><title>Test</title></head>
    <body>
    <img src="image1.jpg" alt="Image 1" width="200" height="100">
    <img src="image2.png" alt="Image 2">
    <img src="image3.gif">
    </body>
    </html>
  HTML
  html_file = create_test_html_file(markup)

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata
    images = metadata['images'] if metadata.is_a?(Hash)

    if images
      assert images.is_a?(Array)
    elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_instance_of Array, metadata.images
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
488
+
489
+ # ============================================================================
490
+ # Edge Cases
491
+ # ============================================================================
492
+
493
# Extracts an HTML document with an empty body and checks that the collection
# fields of the returned metadata have the expected container types.
def test_metadata_empty_html
  html_file = create_test_html_file('<html><body></body></html>')

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata

    case metadata
    when Kreuzberg::HtmlMetadata
      # Reader => exact container class each collection must be an instance of.
      { keywords: Array, open_graph: Hash, twitter_card: Hash, meta_tags: Hash,
        headers: Array, links: Array, images: Array, structured_data: Array }.each do |reader, klass|
        assert_instance_of klass, metadata.public_send(reader)
      end
    when Hash
      # Missing keys fall back to an empty container of the expected type.
      assert_instance_of Array, metadata['keywords'] || []
      assert_instance_of Hash, metadata['open_graph'] || {}
      assert_instance_of Hash, metadata['twitter_card'] || {}
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
518
+
519
# Checks that every scalar field constructed as nil is reported as nil by its reader.
def test_metadata_nil_optional_fields
  optional = %i[title description author copyright canonical_url language text_direction
                mime_type charset generator viewport theme_color application_name robots]
  attrs = optional.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(keywords: [], open_graph: {}, twitter_card: {}, meta_tags: {},
               headers: [], links: [], images: [], structured_data: [])
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  optional.each do |reader|
    assert_nil metadata.public_send(reader)
  end
end
560
+
561
# Checks that every collection field constructed empty is reported as empty by its reader.
def test_metadata_empty_collections
  unset = %i[title description author copyright canonical_url language text_direction
             mime_type charset generator viewport theme_color application_name robots]
  attrs = unset.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(keywords: [], open_graph: {}, twitter_card: {}, meta_tags: {},
               headers: [], links: [], images: [], structured_data: [])
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  collections = %i[keywords open_graph twitter_card meta_tags headers links images structured_data]
  collections.each do |reader|
    assert_empty metadata.public_send(reader)
  end
end
596
+
597
+ # ============================================================================
598
+ # Sorbet Type Safety
599
+ # ============================================================================
600
+
601
# Checks that a constructed HtmlMetadata is of the expected kind and responds
# to the title, keywords and open_graph readers.
def test_type_checking_enabled
  unset = %i[description author copyright canonical_url language text_direction
             mime_type charset generator viewport theme_color application_name robots]
  attrs = unset.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(title: 'Test', keywords: ['test'],
               open_graph: {}, twitter_card: {}, meta_tags: {},
               headers: [], links: [], images: [], structured_data: [])
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  assert_kind_of Kreuzberg::HtmlMetadata, metadata
  %i[title keywords open_graph].each do |reader|
    assert metadata.respond_to?(reader)
  end
end
632
+
633
# Checks that assigning to #title on a constructed HtmlMetadata raises NoMethodError.
def test_immutable_tstruct_fields
  unset = %i[description author copyright canonical_url language text_direction
             mime_type charset generator viewport theme_color application_name robots]
  attrs = unset.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(title: 'Original', keywords: [],
               open_graph: {}, twitter_card: {}, meta_tags: {},
               headers: [], links: [], images: [], structured_data: [])
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  assert_raises(NoMethodError) do
    metadata.title = 'Modified'
  end
end
661
+
662
# Checks that HtmlMetadata keeps a list of HeaderMetadata entries of mixed
# levels in the order supplied.
def test_headers_with_multiple_levels
  # Columns: level, text, id, depth, html_offset.
  rows = [
    [1, 'H1', nil, 0, 0],
    [2, 'H2', nil, 1, 50],
    [3, 'H3', 'sec-1', 2, 100],
    [2, 'H2-2', nil, 1, 150]
  ]
  headers = rows.map do |level, text, id, depth, html_offset|
    Kreuzberg::HeaderMetadata.new(level: level, text: text, id: id, depth: depth, html_offset: html_offset)
  end

  unset = %i[title description author copyright canonical_url language text_direction
             mime_type charset generator viewport theme_color application_name robots]
  attrs = unset.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(keywords: [], open_graph: {}, twitter_card: {}, meta_tags: {},
               headers: headers, links: [], images: [], structured_data: [])
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  stored = metadata.headers
  assert_equal 4, stored.length
  assert_equal 1, stored[0].level
  assert_equal 3, stored[2].level
  assert_equal 'sec-1', stored[2].id
end
700
+
701
# Checks that HtmlMetadata keeps external, internal and anchor LinkMetadata
# entries in the order supplied, together with their attributes.
def test_links_with_various_types
  link_specs = [
    { href: 'https://external.com', text: 'External', title: nil,
      link_type: 'external', rel: ['noopener'], attributes: {} },
    { href: '/internal/page', text: 'Internal', title: 'Internal Page',
      link_type: 'internal', rel: [], attributes: { 'class' => 'nav-link' } },
    { href: '#section', text: 'Anchor', title: nil,
      link_type: 'anchor', rel: [], attributes: {} }
  ]
  links = link_specs.map { |spec| Kreuzberg::LinkMetadata.new(**spec) }

  unset = %i[title description author copyright canonical_url language text_direction
             mime_type charset generator viewport theme_color application_name robots]
  attrs = unset.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(keywords: [], open_graph: {}, twitter_card: {}, meta_tags: {},
               headers: [], links: links, images: [], structured_data: [])
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  assert_equal 3, metadata.links.length
  %w[external internal anchor].each_with_index do |kind, index|
    assert_equal kind, metadata.links[index].link_type
  end
  assert_equal 'nav-link', metadata.links[1].attributes['class']
end
760
+
761
# Checks that HtmlMetadata keeps ImageMetadata entries in the order supplied,
# with their dimensions and attributes.
def test_images_with_attributes
  image_specs = [
    { src: 'logo.png', alt: 'Logo', title: nil, dimensions: [200, 100],
      image_type: 'png', attributes: { 'class' => 'logo', 'loading' => 'eager' } },
    { src: 'thumbnail.jpg', alt: nil, title: 'Thumbnail', dimensions: nil,
      image_type: 'jpg', attributes: { 'loading' => 'lazy', 'decoding' => 'async' } }
  ]
  images = image_specs.map { |spec| Kreuzberg::ImageMetadata.new(**spec) }

  unset = %i[title description author copyright canonical_url language text_direction
             mime_type charset generator viewport theme_color application_name robots]
  attrs = unset.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(keywords: [], open_graph: {}, twitter_card: {}, meta_tags: {},
               headers: [], links: [], images: images, structured_data: [])
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  stored = metadata.images
  assert_equal 2, stored.length
  assert_equal [200, 100], stored[0].dimensions
  assert_nil stored[1].dimensions
  assert_equal 'lazy', stored[1].attributes['loading']
end
811
+
812
# Checks that HtmlMetadata keeps StructuredData entries of different data
# types in the order supplied, including one without a schema type.
def test_structured_data_multiple_types
  # Columns: data_type, raw_json, schema_type.
  rows = [
    ['json-ld', '{"@context":"https://schema.org","@type":"Article"}', 'Article'],
    ['microdata', '{"type":"http://schema.org/Person"}', 'Person'],
    ['json-ld', '{"@type":"Organization"}', nil]
  ]
  structured_data = rows.map do |data_type, raw_json, schema_type|
    Kreuzberg::StructuredData.new(data_type: data_type, raw_json: raw_json, schema_type: schema_type)
  end

  unset = %i[title description author copyright canonical_url language text_direction
             mime_type charset generator viewport theme_color application_name robots]
  attrs = unset.each_with_object({}) { |field, acc| acc[field] = nil }
  attrs.update(keywords: [], open_graph: {}, twitter_card: {}, meta_tags: {},
               headers: [], links: [], images: [], structured_data: structured_data)
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  stored = metadata.structured_data
  assert_equal 3, stored.length
  assert_equal 'json-ld', stored[0].data_type
  assert_equal 'Article', stored[0].schema_type
  assert_equal 'microdata', stored[1].data_type
  assert_nil stored[2].schema_type
end
865
+
866
# Builds an HtmlMetadata with every scalar and collection field populated and
# checks the scalar values and the size of each collection.
def test_html_metadata_with_all_fields_populated
  header = Kreuzberg::HeaderMetadata.new(level: 1, text: 'Title', id: 'title', depth: 0, html_offset: 100)
  link = Kreuzberg::LinkMetadata.new(
    href: 'https://example.com', text: 'Example', title: 'Example Site',
    link_type: 'external', rel: ['noopener'], attributes: { 'data-track' => 'true' }
  )
  image = Kreuzberg::ImageMetadata.new(
    src: 'image.jpg', alt: 'Test Image', title: nil,
    dimensions: [300, 200], image_type: 'jpg', attributes: { 'loading' => 'lazy' }
  )
  structured = Kreuzberg::StructuredData.new(
    data_type: 'json-ld', raw_json: '{"@type":"WebPage"}', schema_type: 'WebPage'
  )

  attrs = {
    title: 'Complete Test Page',
    description: 'A complete test page with all metadata',
    author: 'Test Author',
    copyright: '2024 Test Corp',
    keywords: %w[test comprehensive metadata],
    canonical_url: 'https://example.com/test',
    language: 'en',
    text_direction: 'ltr',
    mime_type: 'text/html; charset=utf-8',
    charset: 'utf-8',
    generator: 'Kreuzberg',
    viewport: 'width=device-width, initial-scale=1',
    theme_color: '#ffffff',
    application_name: 'Test App',
    robots: 'index, follow',
    open_graph: {
      'og:title' => 'Test',
      'og:description' => 'Description',
      'og:image' => 'https://example.com/image.jpg'
    },
    twitter_card: {
      'twitter:card' => 'summary_large_image',
      'twitter:title' => 'Test'
    },
    meta_tags: { 'custom-tag' => 'custom-value' },
    headers: [header],
    links: [link],
    images: [image],
    structured_data: [structured]
  }
  metadata = Kreuzberg::HtmlMetadata.new(**attrs)

  %i[title description author copyright].each do |reader|
    assert_equal attrs[reader], metadata.public_send(reader)
  end
  assert_equal 3, metadata.keywords.length
  %i[canonical_url language text_direction generator].each do |reader|
    assert_equal attrs[reader], metadata.public_send(reader)
  end

  # Reader => number of entries supplied for that collection above.
  { open_graph: 3, twitter_card: 2, meta_tags: 1,
    headers: 1, links: 1, images: 1, structured_data: 1 }.each do |reader, size|
    assert_equal size, metadata.public_send(reader).length
  end
end
949
+
950
+ private
951
+
952
# Writes +content+ to a fresh temporary file with an ".html" suffix and
# returns the file's path.
#
# Tempfile.create is used instead of Tempfile.new: a Tempfile object unlinks
# its file when it is garbage-collected, and only the path is returned here,
# so the file could disappear before the caller reads it. Tempfile.create
# returns a plain File with no finalizer; the caller must delete the path.
#
# @param content [String] the HTML text to write
# @return [String] path of the written file
def create_test_html_file(content)
  file = Tempfile.create(['test', '.html'])
  begin
    file.write(content)
  ensure
    # Close even if the write fails so the descriptor is not leaked.
    file.close
  end
  file.path
end
958
+ end
959
+ # rubocop:enable Metrics/ClassLength, Metrics/MethodLength, Metrics/AbcSize