kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,959 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'minitest/autorun'
|
|
4
|
+
require 'kreuzberg'
|
|
5
|
+
require 'json'
|
|
6
|
+
require 'tempfile'
|
|
7
|
+
|
|
8
|
+
# Comprehensive tests for Kreuzberg metadata types
|
|
9
|
+
# Tests verify T::Struct behavior, type safety, and integration with extraction
|
|
10
|
+
# rubocop:disable Metrics/ClassLength, Metrics/MethodLength, Metrics/AbcSize
|
|
11
|
+
class MetadataTypesTest < Minitest::Test
|
|
12
|
+
# Builds an HtmlMetadata with every keyword supplied and checks that each
# scalar string field reads back the value it was constructed with.
def test_html_metadata_structure
  # Fields asserted below, listed in assertion order.
  checked = {
    title: 'Test Page',
    description: 'A test description',
    author: 'Test Author',
    copyright: '2024 Test Corp',
    canonical_url: 'https://example.com/test',
    language: 'en',
    text_direction: 'ltr',
    mime_type: 'text/html',
    charset: 'utf-8',
    generator: 'Kreuzberg',
    theme_color: '#ffffff',
    application_name: 'Test App',
    robots: 'index, follow'
  }
  # Fields handed to the constructor but not asserted on by this test.
  unchecked = {
    keywords: %w[test metadata],
    viewport: 'width=device-width, initial-scale=1',
    open_graph: { 'og:title' => 'Test', 'og:image' => 'image.jpg' },
    twitter_card: { 'twitter:card' => 'summary' },
    meta_tags: { 'custom' => 'value' },
    headers: [],
    links: [],
    images: [],
    structured_data: []
  }

  metadata = Kreuzberg::HtmlMetadata.new(**checked, **unchecked)

  checked.each do |field, value|
    assert_equal value, metadata.public_send(field)
  end
end
|
|
52
|
+
|
|
53
|
+
# Checks that keywords given to HtmlMetadata come back as an Array whose
# elements are all Strings.
def test_keywords_is_array
  keywords_array = %w[test metadata array]
  # Every scalar field is nil; only keywords carries data in this test.
  scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].to_h { |field| [field, nil] }

  metadata = Kreuzberg::HtmlMetadata.new(
    **scalars,
    keywords: keywords_array,
    open_graph: {},
    twitter_card: {},
    meta_tags: {},
    headers: [],
    links: [],
    images: [],
    structured_data: []
  )

  assert_instance_of Array, metadata.keywords
  assert_equal keywords_array, metadata.keywords
  metadata.keywords.each do |keyword|
    assert_instance_of String, keyword
  end
end
|
|
84
|
+
|
|
85
|
+
# Checks that HtmlMetadata exposes the canonical URL through a
# `canonical_url` reader and returns the value it was given.
def test_canonical_url_renamed
  scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].to_h { |field| [field, nil] }

  metadata = Kreuzberg::HtmlMetadata.new(
    **scalars.merge(canonical_url: 'https://example.com/canonical'),
    keywords: [],
    open_graph: {},
    twitter_card: {},
    meta_tags: {},
    headers: [],
    links: [],
    images: [],
    structured_data: []
  )

  assert_equal 'https://example.com/canonical', metadata.canonical_url
  assert_respond_to metadata, :canonical_url
end
|
|
114
|
+
|
|
115
|
+
# Checks that open_graph is stored as a Hash equal to the input, with String
# keys and String values.
def test_open_graph_is_hash
  og_tags = {
    'og:title' => 'Test Title',
    'og:description' => 'Test Description',
    'og:image' => 'https://example.com/image.jpg',
    'og:url' => 'https://example.com'
  }
  scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].to_h { |field| [field, nil] }

  metadata = Kreuzberg::HtmlMetadata.new(
    **scalars,
    keywords: [],
    open_graph: og_tags,
    twitter_card: {},
    meta_tags: {},
    headers: [],
    links: [],
    images: [],
    structured_data: []
  )

  assert_instance_of Hash, metadata.open_graph
  assert_equal og_tags, metadata.open_graph
  # Each entry is visited as a [key, value] pair; key is checked before value.
  metadata.open_graph.each do |entry|
    entry.each { |part| assert_instance_of String, part }
  end
end
|
|
154
|
+
|
|
155
|
+
# Checks that twitter_card is stored as a Hash equal to the input, with
# String keys and String values.
def test_twitter_card_is_hash
  twitter_tags = {
    'twitter:card' => 'summary_large_image',
    'twitter:title' => 'Test',
    'twitter:description' => 'Description',
    'twitter:image' => 'https://example.com/image.jpg'
  }
  scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].to_h { |field| [field, nil] }

  metadata = Kreuzberg::HtmlMetadata.new(
    **scalars,
    keywords: [],
    open_graph: {},
    twitter_card: twitter_tags,
    meta_tags: {},
    headers: [],
    links: [],
    images: [],
    structured_data: []
  )

  assert_instance_of Hash, metadata.twitter_card
  assert_equal twitter_tags, metadata.twitter_card
  # Each entry is visited as a [key, value] pair; key is checked before value.
  metadata.twitter_card.each do |entry|
    entry.each { |part| assert_instance_of String, part }
  end
end
|
|
194
|
+
|
|
195
|
+
# ============================================================================
|
|
196
|
+
# T::Struct Behavior Tests
|
|
197
|
+
# ============================================================================
|
|
198
|
+
|
|
199
|
+
# Constructs a HeaderMetadata with all five fields set and checks each reader
# returns the constructor value.
def test_header_metadata_creation
  attrs = {
    level: 1,
    text: 'Main Title',
    id: 'main-title',
    depth: 0,
    html_offset: 245
  }

  header = Kreuzberg::HeaderMetadata.new(**attrs)

  attrs.each do |name, expected|
    assert_equal expected, header.public_send(name)
  end
end
|
|
214
|
+
|
|
215
|
+
# Constructs a HeaderMetadata whose id is nil and checks that the nil is
# preserved while the other fields read back unchanged.
def test_header_metadata_nil_id
  attrs = {
    level: 2,
    text: 'Subtitle',
    id: nil,
    depth: 1,
    html_offset: 456
  }

  header = Kreuzberg::HeaderMetadata.new(**attrs)

  attrs.each do |name, expected|
    actual = header.public_send(name)
    # assert_nil is used for the nil field, assert_equal for the rest.
    if expected.nil?
      assert_nil actual
    else
      assert_equal expected, actual
    end
  end
end
|
|
230
|
+
|
|
231
|
+
# Constructs a fully populated LinkMetadata and checks its scalar fields, the
# rel Array and the attributes Hash.
def test_link_metadata_creation
  rel_values = %w[noopener noreferrer]
  strings = {
    href: 'https://example.com',
    text: 'Example',
    title: 'Example Site',
    link_type: 'external'
  }

  link = Kreuzberg::LinkMetadata.new(
    **strings,
    rel: rel_values,
    attributes: { 'data-id' => '123', 'class' => 'external-link' }
  )

  strings.each do |field, expected|
    assert_equal expected, link.public_send(field)
  end

  assert_instance_of Array, link.rel
  assert_equal %w[noopener noreferrer], link.rel

  assert_instance_of Hash, link.attributes
  { 'data-id' => '123', 'class' => 'external-link' }.each do |key, expected|
    assert_equal expected, link.attributes[key]
  end
end
|
|
251
|
+
|
|
252
|
+
# Constructs a LinkMetadata with an empty rel list, empty attributes and a nil
# title, and checks those empty/nil values are preserved.
def test_link_metadata_empty_arrays_and_hashes
  attrs = {
    href: 'https://example.com',
    text: 'Link',
    title: nil,
    link_type: 'internal',
    rel: [],
    attributes: {}
  }

  link = Kreuzberg::LinkMetadata.new(**attrs)

  assert_equal 'https://example.com', link.href
  %i[rel attributes].each do |collection|
    assert_empty link.public_send(collection)
  end
  assert_nil link.title
end
|
|
267
|
+
|
|
268
|
+
# Constructs an ImageMetadata with dimensions and attributes and checks every
# field except the nil title reads back as given.
def test_image_metadata_creation
  attrs = {
    src: 'images/logo.png',
    alt: 'Company Logo',
    title: nil,
    dimensions: [200, 100],
    image_type: 'png',
    attributes: { 'loading' => 'lazy', 'class' => 'logo' }
  }

  image = Kreuzberg::ImageMetadata.new(**attrs)

  assert_equal 'images/logo.png', image.src
  assert_equal 'Company Logo', image.alt
  assert_nil image.title

  dims = image.dimensions
  assert_instance_of Array, dims
  assert_equal [200, 100], image.dimensions

  assert_equal 'png', image.image_type

  extra = image.attributes
  assert_instance_of Hash, extra
  assert_equal 'lazy', image.attributes['loading']
end
|
|
287
|
+
|
|
288
|
+
# Constructs an ImageMetadata without dimensions and checks that the nil is
# preserved alongside src and image_type.
def test_image_metadata_nil_dimensions
  attrs = {
    src: 'image.jpg',
    alt: 'Description',
    title: 'Title',
    dimensions: nil,
    image_type: 'jpg',
    attributes: {}
  }

  image = Kreuzberg::ImageMetadata.new(**attrs)

  assert_equal 'image.jpg', image.src
  assert_nil image.dimensions
  assert_equal 'jpg', image.image_type
end
|
|
302
|
+
|
|
303
|
+
# Constructs a StructuredData entry from a JSON-LD string and checks its
# fields, then confirms raw_json still parses to the expected @type.
def test_structured_data_creation
  json_data = '{"@context":"https://schema.org","@type":"Article","headline":"Test Article"}'
  attrs = {
    data_type: 'json-ld',
    raw_json: json_data,
    schema_type: 'Article'
  }

  structured = Kreuzberg::StructuredData.new(**attrs)

  attrs.each do |field, expected|
    assert_equal expected, structured.public_send(field)
  end

  # raw_json is kept as a String, so it must round-trip through JSON.parse.
  assert_equal 'Article', JSON.parse(structured.raw_json)['@type']
end
|
|
317
|
+
|
|
318
|
+
# Constructs a StructuredData entry with no schema_type and checks the nil is
# preserved.
def test_structured_data_nil_schema_type
  json_data = '{"data":"value"}'
  attrs = { data_type: 'microdata', raw_json: json_data, schema_type: nil }

  structured = Kreuzberg::StructuredData.new(**attrs)

  assert_equal 'microdata', structured.data_type
  assert_nil structured.schema_type
end
|
|
329
|
+
|
|
330
|
+
# ============================================================================
|
|
331
|
+
# Integration Tests
|
|
332
|
+
# ============================================================================
|
|
333
|
+
|
|
334
|
+
# Extracts a minimal HTML document through Kreuzberg.extract_file_sync and
# checks that a Kreuzberg::Result with non-nil metadata comes back.
#
# The fixture is produced by create_test_html_file (defined outside this
# method; presumably returns the path of a temporary file — confirm) and is
# removed in the ensure clause whether or not the assertions pass.
def test_extract_html_returns_metadata
  html_file = create_test_html_file(
    '<html><head><title>Test Page</title></head><body><p>Content</p></body></html>'
  )

  begin
    result = Kreuzberg.extract_file_sync(html_file)
    assert_instance_of Kreuzberg::Result, result
    # Minitest does not define assert_not_nil (that name comes from
    # Test::Unit / ActiveSupport); refute_nil is the Minitest assertion.
    refute_nil result.metadata

    # NOTE(review): each branch only re-asserts its own guard, so these checks
    # cannot fail, and metadata of any other class passes silently. Kept as-is
    # because the intended metadata type is not visible from this test.
    if result.metadata.is_a?(Hash)
      assert_kind_of Hash, result.metadata
    elsif result.metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_kind_of Kreuzberg::HtmlMetadata, result.metadata
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
|
|
353
|
+
|
|
354
|
+
# Extracts an HTML document carrying a keywords meta tag and, when the
# metadata exposes keywords, checks they are an Array.
def test_metadata_keywords_array
  html_file = create_test_html_file(<<~HTML)
    <html>
    <head>
    <title>Test</title>
    <meta name="keywords" content="ruby, testing, metadata">
    </head>
    <body></body>
    </html>
  HTML

  metadata = Kreuzberg.extract_file_sync(html_file).metadata

  # Hash-shaped metadata is only checked when the key holds a truthy value.
  from_hash = metadata['keywords'] if metadata.is_a?(Hash)
  if from_hash
    assert_kind_of Array, from_hash
  elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
    assert_instance_of Array, metadata.keywords
  end
ensure
  # html_file is nil when fixture creation itself failed; nothing to remove.
  FileUtils.rm_f(html_file) if html_file
end
|
|
379
|
+
|
|
380
|
+
# Extracts an HTML document carrying Open Graph meta tags and, when the
# metadata exposes open_graph, checks it is a Hash.
def test_metadata_open_graph_hash
  html_file = create_test_html_file(<<~HTML)
    <html>
    <head>
    <title>Test</title>
    <meta property="og:title" content="Test Title">
    <meta property="og:description" content="Test Description">
    <meta property="og:image" content="https://example.com/image.jpg">
    </head>
    <body></body>
    </html>
  HTML

  metadata = Kreuzberg.extract_file_sync(html_file).metadata

  # Hash-shaped metadata is only checked when the key holds a truthy value.
  from_hash = metadata['open_graph'] if metadata.is_a?(Hash)
  if from_hash
    assert_kind_of Hash, from_hash
  elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
    assert_instance_of Hash, metadata.open_graph
  end
ensure
  # html_file is nil when fixture creation itself failed; nothing to remove.
  FileUtils.rm_f(html_file) if html_file
end
|
|
407
|
+
|
|
408
|
+
# Extracts an HTML document with h1-h3 headings and, when the metadata
# exposes headers, checks they are an Array.
def test_metadata_headers_array
  html_file = create_test_html_file(<<~HTML)
    <html>
    <head><title>Test</title></head>
    <body>
    <h1>Main Title</h1>
    <h2>Subtitle</h2>
    <h3 id="section-1">Section 1</h3>
    </body>
    </html>
  HTML

  metadata = Kreuzberg.extract_file_sync(html_file).metadata

  # Hash-shaped metadata is only checked when the key holds a truthy value.
  from_hash = metadata['headers'] if metadata.is_a?(Hash)
  if from_hash
    assert_kind_of Array, from_hash
  elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
    assert_instance_of Array, metadata.headers
  end
ensure
  # html_file is nil when fixture creation itself failed; nothing to remove.
  FileUtils.rm_f(html_file) if html_file
end
|
|
434
|
+
|
|
435
|
+
# Extracts an HTML document with external, internal and anchor links and,
# when the metadata exposes links, checks they are an Array.
def test_metadata_links_array
  html_file = create_test_html_file(<<~HTML)
    <html>
    <head><title>Test</title></head>
    <body>
    <a href="https://example.com">External Link</a>
    <a href="/page">Internal Link</a>
    <a href="#section">Anchor Link</a>
    </body>
    </html>
  HTML

  metadata = Kreuzberg.extract_file_sync(html_file).metadata

  # Hash-shaped metadata is only checked when the key holds a truthy value.
  from_hash = metadata['links'] if metadata.is_a?(Hash)
  if from_hash
    assert_kind_of Array, from_hash
  elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
    assert_instance_of Array, metadata.links
  end
ensure
  # html_file is nil when fixture creation itself failed; nothing to remove.
  FileUtils.rm_f(html_file) if html_file
end
|
|
461
|
+
|
|
462
|
+
# Extracts an HTML document with three img tags and, when the metadata
# exposes images, checks they are an Array.
def test_metadata_images_array
  html_file = create_test_html_file(<<~HTML)
    <html>
    <head><title>Test</title></head>
    <body>
    <img src="image1.jpg" alt="Image 1" width="200" height="100">
    <img src="image2.png" alt="Image 2">
    <img src="image3.gif">
    </body>
    </html>
  HTML

  metadata = Kreuzberg.extract_file_sync(html_file).metadata

  # Hash-shaped metadata is only checked when the key holds a truthy value.
  from_hash = metadata['images'] if metadata.is_a?(Hash)
  if from_hash
    assert_kind_of Array, from_hash
  elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
    assert_instance_of Array, metadata.images
  end
ensure
  # html_file is nil when fixture creation itself failed; nothing to remove.
  FileUtils.rm_f(html_file) if html_file
end
|
|
488
|
+
|
|
489
|
+
# ============================================================================
|
|
490
|
+
# Edge Cases
|
|
491
|
+
# ============================================================================
|
|
492
|
+
|
|
493
|
+
# Extracts an HTML document with no head or body content and checks that the
# collection-valued metadata fields still have their container types.
def test_metadata_empty_html
  html_file = create_test_html_file('<html><body></body></html>')

  metadata = Kreuzberg.extract_file_sync(html_file).metadata

  if metadata.is_a?(Kreuzberg::HtmlMetadata)
    # Reader name => expected container class, in assertion order.
    {
      keywords: Array,
      open_graph: Hash,
      twitter_card: Hash,
      meta_tags: Hash,
      headers: Array,
      links: Array,
      images: Array,
      structured_data: Array
    }.each do |field, container|
      assert_instance_of container, metadata.public_send(field)
    end
  elsif metadata.is_a?(Hash)
    # A missing key falls back to an empty container of the expected class.
    { 'keywords' => Array, 'open_graph' => Hash, 'twitter_card' => Hash }.each do |key, container|
      assert_instance_of container, metadata[key] || container.new
    end
  end
ensure
  # html_file is nil when fixture creation itself failed; nothing to remove.
  FileUtils.rm_f(html_file) if html_file
end
|
|
518
|
+
|
|
519
|
+
# Constructs an HtmlMetadata with every scalar field nil and checks each of
# those readers returns nil.
def test_metadata_nil_optional_fields
  optional_fields = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ]

  metadata = Kreuzberg::HtmlMetadata.new(
    **optional_fields.to_h { |field| [field, nil] },
    keywords: [],
    open_graph: {},
    twitter_card: {},
    meta_tags: {},
    headers: [],
    links: [],
    images: [],
    structured_data: []
  )

  optional_fields.each do |field|
    assert_nil metadata.public_send(field)
  end
end
|
|
560
|
+
|
|
561
|
+
# Constructs an HtmlMetadata with empty collections and checks each
# collection reader reports empty.
def test_metadata_empty_collections
  scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].to_h { |field| [field, nil] }
  collections = {
    keywords: [],
    open_graph: {},
    twitter_card: {},
    meta_tags: {},
    headers: [],
    links: [],
    images: [],
    structured_data: []
  }

  metadata = Kreuzberg::HtmlMetadata.new(**scalars, **collections)

  collections.each_key do |field|
    assert_empty metadata.public_send(field)
  end
end
|
|
596
|
+
|
|
597
|
+
# ============================================================================
|
|
598
|
+
# Sorbet Type Safety
|
|
599
|
+
# ============================================================================
|
|
600
|
+
|
|
601
|
+
# Constructs an HtmlMetadata and checks the instance's class and that it
# responds to the title, keywords and open_graph readers.
def test_type_checking_enabled
  scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].to_h { |field| [field, nil] }

  metadata = Kreuzberg::HtmlMetadata.new(
    **scalars.merge(title: 'Test'),
    keywords: ['test'],
    open_graph: {},
    twitter_card: {},
    meta_tags: {},
    headers: [],
    links: [],
    images: [],
    structured_data: []
  )

  assert_kind_of Kreuzberg::HtmlMetadata, metadata
  %i[title keywords open_graph].each do |reader|
    assert_respond_to metadata, reader
  end
end
|
|
632
|
+
|
|
633
|
+
# Checks that HtmlMetadata has no title writer: assigning to title must
# raise NoMethodError.
def test_immutable_tstruct_fields
  scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].to_h { |field| [field, nil] }

  metadata = Kreuzberg::HtmlMetadata.new(
    **scalars.merge(title: 'Original'),
    keywords: [],
    open_graph: {},
    twitter_card: {},
    meta_tags: {},
    headers: [],
    links: [],
    images: [],
    structured_data: []
  )

  assert_raises(NoMethodError) do
    metadata.title = 'Modified'
  end
end
|
|
661
|
+
|
|
662
|
+
# Stores four HeaderMetadata entries of mixed levels in an HtmlMetadata and
# checks the count plus selected level/id values by position.
def test_headers_with_multiple_levels
  # Rows are [level, text, id, depth, html_offset].
  headers = [
    [1, 'H1', nil, 0, 0],
    [2, 'H2', nil, 1, 50],
    [3, 'H3', 'sec-1', 2, 100],
    [2, 'H2-2', nil, 1, 150]
  ].map do |level, text, id, depth, html_offset|
    Kreuzberg::HeaderMetadata.new(level: level, text: text, id: id, depth: depth, html_offset: html_offset)
  end
  scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].to_h { |field| [field, nil] }

  metadata = Kreuzberg::HtmlMetadata.new(
    **scalars,
    keywords: [],
    open_graph: {},
    twitter_card: {},
    meta_tags: {},
    headers: headers,
    links: [],
    images: [],
    structured_data: []
  )

  assert_equal 4, metadata.headers.length
  assert_equal 1, metadata.headers[0].level
  assert_equal 3, metadata.headers[2].level
  assert_equal 'sec-1', metadata.headers[2].id
end
|
|
700
|
+
|
|
701
|
+
# Stores external, internal and anchor LinkMetadata entries in an
# HtmlMetadata and checks their count, link_type order and one attribute.
def test_links_with_various_types
  links = [
    {
      href: 'https://external.com', text: 'External', title: nil,
      link_type: 'external', rel: ['noopener'], attributes: {}
    },
    {
      href: '/internal/page', text: 'Internal', title: 'Internal Page',
      link_type: 'internal', rel: [], attributes: { 'class' => 'nav-link' }
    },
    {
      href: '#section', text: 'Anchor', title: nil,
      link_type: 'anchor', rel: [], attributes: {}
    }
  ].map { |attrs| Kreuzberg::LinkMetadata.new(**attrs) }
  scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].to_h { |field| [field, nil] }

  metadata = Kreuzberg::HtmlMetadata.new(
    **scalars,
    keywords: [],
    open_graph: {},
    twitter_card: {},
    meta_tags: {},
    headers: [],
    links: links,
    images: [],
    structured_data: []
  )

  assert_equal 3, metadata.links.length
  %w[external internal anchor].each_with_index do |expected_type, position|
    assert_equal expected_type, metadata.links[position].link_type
  end
  assert_equal 'nav-link', metadata.links[1].attributes['class']
end
|
|
760
|
+
|
|
761
|
+
# Stores two ImageMetadata entries (one with dimensions, one without) in an
# HtmlMetadata and checks count, dimensions and one attribute.
def test_images_with_attributes
  images = [
    {
      src: 'logo.png', alt: 'Logo', title: nil,
      dimensions: [200, 100], image_type: 'png',
      attributes: { 'class' => 'logo', 'loading' => 'eager' }
    },
    {
      src: 'thumbnail.jpg', alt: nil, title: 'Thumbnail',
      dimensions: nil, image_type: 'jpg',
      attributes: { 'loading' => 'lazy', 'decoding' => 'async' }
    }
  ].map { |attrs| Kreuzberg::ImageMetadata.new(**attrs) }
  scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].to_h { |field| [field, nil] }

  metadata = Kreuzberg::HtmlMetadata.new(
    **scalars,
    keywords: [],
    open_graph: {},
    twitter_card: {},
    meta_tags: {},
    headers: [],
    links: [],
    images: images,
    structured_data: []
  )

  assert_equal 2, metadata.images.length
  assert_equal [200, 100], metadata.images[0].dimensions
  assert_nil metadata.images[1].dimensions
  assert_equal 'lazy', metadata.images[1].attributes['loading']
end
|
|
811
|
+
|
|
812
|
+
# Stores three StructuredData entries (two 'json-ld', one 'microdata', the
# last one without a schema_type) in an otherwise empty HtmlMetadata and
# checks that their data_type / schema_type values are readable by index.
def test_structured_data_multiple_types
  article = Kreuzberg::StructuredData.new(
    data_type: 'json-ld',
    raw_json: '{"@context":"https://schema.org","@type":"Article"}',
    schema_type: 'Article'
  )
  person = Kreuzberg::StructuredData.new(
    data_type: 'microdata',
    raw_json: '{"type":"http://schema.org/Person"}',
    schema_type: 'Person'
  )
  organization = Kreuzberg::StructuredData.new(
    data_type: 'json-ld',
    raw_json: '{"@type":"Organization"}',
    schema_type: nil
  )

  # Every field other than `structured_data` is left nil/empty on purpose.
  metadata = Kreuzberg::HtmlMetadata.new(
    title: nil, description: nil, author: nil, copyright: nil,
    keywords: [],
    canonical_url: nil, language: nil, text_direction: nil,
    mime_type: nil, charset: nil, generator: nil, viewport: nil,
    theme_color: nil, application_name: nil, robots: nil,
    open_graph: {}, twitter_card: {}, meta_tags: {},
    headers: [], links: [], images: [],
    structured_data: [article, person, organization]
  )

  entries = metadata.structured_data
  assert_equal 3, entries.length
  assert_equal 'json-ld', entries[0].data_type
  assert_equal 'Article', entries[0].schema_type
  assert_equal 'microdata', entries[1].data_type
  assert_nil entries[2].schema_type
end
|
|
865
|
+
|
|
866
|
+
# Constructs an HtmlMetadata with a value supplied for every keyword and
# checks the scalar fields it reads back plus the size of each collection.
# NOTE(review): mime_type, charset, viewport, theme_color, application_name
# and robots are passed in but not asserted on here.
def test_html_metadata_with_all_fields_populated
  heading = Kreuzberg::HeaderMetadata.new(level: 1, text: 'Title', id: 'title', depth: 0, html_offset: 100)
  link = Kreuzberg::LinkMetadata.new(
    href: 'https://example.com',
    text: 'Example',
    title: 'Example Site',
    link_type: 'external',
    rel: ['noopener'],
    attributes: { 'data-track' => 'true' }
  )
  image = Kreuzberg::ImageMetadata.new(
    src: 'image.jpg',
    alt: 'Test Image',
    title: nil,
    dimensions: [300, 200],
    image_type: 'jpg',
    attributes: { 'loading' => 'lazy' }
  )
  web_page = Kreuzberg::StructuredData.new(
    data_type: 'json-ld',
    raw_json: '{"@type":"WebPage"}',
    schema_type: 'WebPage'
  )

  og_tags = {
    'og:title' => 'Test',
    'og:description' => 'Description',
    'og:image' => 'https://example.com/image.jpg'
  }
  twitter_tags = {
    'twitter:card' => 'summary_large_image',
    'twitter:title' => 'Test'
  }
  extra_tags = { 'custom-tag' => 'custom-value' }

  metadata = Kreuzberg::HtmlMetadata.new(
    title: 'Complete Test Page',
    description: 'A complete test page with all metadata',
    author: 'Test Author',
    copyright: '2024 Test Corp',
    keywords: %w[test comprehensive metadata],
    canonical_url: 'https://example.com/test',
    language: 'en',
    text_direction: 'ltr',
    mime_type: 'text/html; charset=utf-8',
    charset: 'utf-8',
    generator: 'Kreuzberg',
    viewport: 'width=device-width, initial-scale=1',
    theme_color: '#ffffff',
    application_name: 'Test App',
    robots: 'index, follow',
    open_graph: og_tags,
    twitter_card: twitter_tags,
    meta_tags: extra_tags,
    headers: [heading],
    links: [link],
    images: [image],
    structured_data: [web_page]
  )

  # Scalar fields.
  assert_equal 'Complete Test Page', metadata.title
  assert_equal 'A complete test page with all metadata', metadata.description
  assert_equal 'Test Author', metadata.author
  assert_equal '2024 Test Corp', metadata.copyright
  assert_equal 3, metadata.keywords.length
  assert_equal 'https://example.com/test', metadata.canonical_url
  assert_equal 'en', metadata.language
  assert_equal 'ltr', metadata.text_direction
  assert_equal 'Kreuzberg', metadata.generator

  # Hash- and array-valued fields: only their sizes are checked.
  assert_equal 3, metadata.open_graph.length
  assert_equal 2, metadata.twitter_card.length
  assert_equal 1, metadata.meta_tags.length
  assert_equal 1, metadata.headers.length
  assert_equal 1, metadata.links.length
  assert_equal 1, metadata.images.length
  assert_equal 1, metadata.structured_data.length
end
|
|
949
|
+
|
|
950
|
+
private
|
|
951
|
+
|
|
952
|
+
# Writes +content+ to a new temporary file with an ".html" suffix and
# returns the file's path.
#
# The Tempfile object is kept in @html_tempfiles for the lifetime of this
# object. Ruby deletes a Tempfile's backing file when the Tempfile is
# garbage collected, so returning only the path (and dropping the object)
# could let the file disappear before the caller reads it.
#
# @param content [String] HTML text to write into the file
# @return [String] filesystem path of the closed temporary file
def create_test_html_file(content)
  file = Tempfile.new(['test', '.html'])
  file.write(content)
  file.close # flush to disk; close without unlinking so the path stays valid
  (@html_tempfiles ||= []) << file
  file.path
end
|
|
958
|
+
end
|
|
959
|
+
# rubocop:enable Metrics/ClassLength, Metrics/MethodLength, Metrics/AbcSize
|