kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,959 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'minitest/autorun'
4
- require 'kreuzberg'
5
- require 'json'
6
- require 'tempfile'
7
-
8
- # Comprehensive tests for Kreuzberg metadata types
9
- # Tests verify T::Struct behavior, type safety, and integration with extraction
10
- # rubocop:disable Metrics/ClassLength, Metrics/MethodLength, Metrics/AbcSize
11
- class MetadataTypesTest < Minitest::Test
12
- def test_html_metadata_structure
13
- metadata = Kreuzberg::HtmlMetadata.new(
14
- title: 'Test Page',
15
- description: 'A test description',
16
- author: 'Test Author',
17
- copyright: '2024 Test Corp',
18
- keywords: %w[test metadata],
19
- canonical_url: 'https://example.com/test',
20
- language: 'en',
21
- text_direction: 'ltr',
22
- mime_type: 'text/html',
23
- charset: 'utf-8',
24
- generator: 'Kreuzberg',
25
- viewport: 'width=device-width, initial-scale=1',
26
- theme_color: '#ffffff',
27
- application_name: 'Test App',
28
- robots: 'index, follow',
29
- open_graph: { 'og:title' => 'Test', 'og:image' => 'image.jpg' },
30
- twitter_card: { 'twitter:card' => 'summary' },
31
- meta_tags: { 'custom' => 'value' },
32
- headers: [],
33
- links: [],
34
- images: [],
35
- structured_data: []
36
- )
37
-
38
- assert_equal 'Test Page', metadata.title
39
- assert_equal 'A test description', metadata.description
40
- assert_equal 'Test Author', metadata.author
41
- assert_equal '2024 Test Corp', metadata.copyright
42
- assert_equal 'https://example.com/test', metadata.canonical_url
43
- assert_equal 'en', metadata.language
44
- assert_equal 'ltr', metadata.text_direction
45
- assert_equal 'text/html', metadata.mime_type
46
- assert_equal 'utf-8', metadata.charset
47
- assert_equal 'Kreuzberg', metadata.generator
48
- assert_equal '#ffffff', metadata.theme_color
49
- assert_equal 'Test App', metadata.application_name
50
- assert_equal 'index, follow', metadata.robots
51
- end
52
-
53
- def test_keywords_is_array
54
- keywords_array = %w[test metadata array]
55
- metadata = Kreuzberg::HtmlMetadata.new(
56
- title: nil,
57
- description: nil,
58
- author: nil,
59
- copyright: nil,
60
- keywords: keywords_array,
61
- canonical_url: nil,
62
- language: nil,
63
- text_direction: nil,
64
- mime_type: nil,
65
- charset: nil,
66
- generator: nil,
67
- viewport: nil,
68
- theme_color: nil,
69
- application_name: nil,
70
- robots: nil,
71
- open_graph: {},
72
- twitter_card: {},
73
- meta_tags: {},
74
- headers: [],
75
- links: [],
76
- images: [],
77
- structured_data: []
78
- )
79
-
80
- assert_instance_of Array, metadata.keywords
81
- assert_equal keywords_array, metadata.keywords
82
- metadata.keywords.each { |keyword| assert_instance_of String, keyword }
83
- end
84
-
85
- def test_canonical_url_renamed
86
- metadata = Kreuzberg::HtmlMetadata.new(
87
- title: nil,
88
- description: nil,
89
- author: nil,
90
- copyright: nil,
91
- keywords: [],
92
- canonical_url: 'https://example.com/canonical',
93
- language: nil,
94
- text_direction: nil,
95
- mime_type: nil,
96
- charset: nil,
97
- generator: nil,
98
- viewport: nil,
99
- theme_color: nil,
100
- application_name: nil,
101
- robots: nil,
102
- open_graph: {},
103
- twitter_card: {},
104
- meta_tags: {},
105
- headers: [],
106
- links: [],
107
- images: [],
108
- structured_data: []
109
- )
110
-
111
- assert_equal 'https://example.com/canonical', metadata.canonical_url
112
- assert_respond_to metadata, :canonical_url
113
- end
114
-
115
- def test_open_graph_is_hash
116
- og_tags = {
117
- 'og:title' => 'Test Title',
118
- 'og:description' => 'Test Description',
119
- 'og:image' => 'https://example.com/image.jpg',
120
- 'og:url' => 'https://example.com'
121
- }
122
- metadata = Kreuzberg::HtmlMetadata.new(
123
- title: nil,
124
- description: nil,
125
- author: nil,
126
- copyright: nil,
127
- keywords: [],
128
- canonical_url: nil,
129
- language: nil,
130
- text_direction: nil,
131
- mime_type: nil,
132
- charset: nil,
133
- generator: nil,
134
- viewport: nil,
135
- theme_color: nil,
136
- application_name: nil,
137
- robots: nil,
138
- open_graph: og_tags,
139
- twitter_card: {},
140
- meta_tags: {},
141
- headers: [],
142
- links: [],
143
- images: [],
144
- structured_data: []
145
- )
146
-
147
- assert_instance_of Hash, metadata.open_graph
148
- assert_equal og_tags, metadata.open_graph
149
- metadata.open_graph.each do |key, value|
150
- assert_instance_of String, key
151
- assert_instance_of String, value
152
- end
153
- end
154
-
155
- def test_twitter_card_is_hash
156
- twitter_tags = {
157
- 'twitter:card' => 'summary_large_image',
158
- 'twitter:title' => 'Test',
159
- 'twitter:description' => 'Description',
160
- 'twitter:image' => 'https://example.com/image.jpg'
161
- }
162
- metadata = Kreuzberg::HtmlMetadata.new(
163
- title: nil,
164
- description: nil,
165
- author: nil,
166
- copyright: nil,
167
- keywords: [],
168
- canonical_url: nil,
169
- language: nil,
170
- text_direction: nil,
171
- mime_type: nil,
172
- charset: nil,
173
- generator: nil,
174
- viewport: nil,
175
- theme_color: nil,
176
- application_name: nil,
177
- robots: nil,
178
- open_graph: {},
179
- twitter_card: twitter_tags,
180
- meta_tags: {},
181
- headers: [],
182
- links: [],
183
- images: [],
184
- structured_data: []
185
- )
186
-
187
- assert_instance_of Hash, metadata.twitter_card
188
- assert_equal twitter_tags, metadata.twitter_card
189
- metadata.twitter_card.each do |key, value|
190
- assert_instance_of String, key
191
- assert_instance_of String, value
192
- end
193
- end
194
-
195
- # ============================================================================
196
- # T::Struct Behavior Tests
197
- # ============================================================================
198
-
199
- def test_header_metadata_creation
200
- header = Kreuzberg::HeaderMetadata.new(
201
- level: 1,
202
- text: 'Main Title',
203
- id: 'main-title',
204
- depth: 0,
205
- html_offset: 245
206
- )
207
-
208
- assert_equal 1, header.level
209
- assert_equal 'Main Title', header.text
210
- assert_equal 'main-title', header.id
211
- assert_equal 0, header.depth
212
- assert_equal 245, header.html_offset
213
- end
214
-
215
- def test_header_metadata_nil_id
216
- header = Kreuzberg::HeaderMetadata.new(
217
- level: 2,
218
- text: 'Subtitle',
219
- id: nil,
220
- depth: 1,
221
- html_offset: 456
222
- )
223
-
224
- assert_equal 2, header.level
225
- assert_equal 'Subtitle', header.text
226
- assert_nil header.id
227
- assert_equal 1, header.depth
228
- assert_equal 456, header.html_offset
229
- end
230
-
231
- def test_link_metadata_creation
232
- link = Kreuzberg::LinkMetadata.new(
233
- href: 'https://example.com',
234
- text: 'Example',
235
- title: 'Example Site',
236
- link_type: 'external',
237
- rel: %w[noopener noreferrer],
238
- attributes: { 'data-id' => '123', 'class' => 'external-link' }
239
- )
240
-
241
- assert_equal 'https://example.com', link.href
242
- assert_equal 'Example', link.text
243
- assert_equal 'Example Site', link.title
244
- assert_equal 'external', link.link_type
245
- assert_instance_of Array, link.rel
246
- assert_equal %w[noopener noreferrer], link.rel
247
- assert_instance_of Hash, link.attributes
248
- assert_equal '123', link.attributes['data-id']
249
- assert_equal 'external-link', link.attributes['class']
250
- end
251
-
252
- def test_link_metadata_empty_arrays_and_hashes
253
- link = Kreuzberg::LinkMetadata.new(
254
- href: 'https://example.com',
255
- text: 'Link',
256
- title: nil,
257
- link_type: 'internal',
258
- rel: [],
259
- attributes: {}
260
- )
261
-
262
- assert_equal 'https://example.com', link.href
263
- assert_empty link.rel
264
- assert_empty link.attributes
265
- assert_nil link.title
266
- end
267
-
268
- def test_image_metadata_creation
269
- image = Kreuzberg::ImageMetadata.new(
270
- src: 'images/logo.png',
271
- alt: 'Company Logo',
272
- title: nil,
273
- dimensions: [200, 100],
274
- image_type: 'png',
275
- attributes: { 'loading' => 'lazy', 'class' => 'logo' }
276
- )
277
-
278
- assert_equal 'images/logo.png', image.src
279
- assert_equal 'Company Logo', image.alt
280
- assert_nil image.title
281
- assert_instance_of Array, image.dimensions
282
- assert_equal [200, 100], image.dimensions
283
- assert_equal 'png', image.image_type
284
- assert_instance_of Hash, image.attributes
285
- assert_equal 'lazy', image.attributes['loading']
286
- end
287
-
288
- def test_image_metadata_nil_dimensions
289
- image = Kreuzberg::ImageMetadata.new(
290
- src: 'image.jpg',
291
- alt: 'Description',
292
- title: 'Title',
293
- dimensions: nil,
294
- image_type: 'jpg',
295
- attributes: {}
296
- )
297
-
298
- assert_equal 'image.jpg', image.src
299
- assert_nil image.dimensions
300
- assert_equal 'jpg', image.image_type
301
- end
302
-
303
- def test_structured_data_creation
304
- json_data = '{"@context":"https://schema.org","@type":"Article","headline":"Test Article"}'
305
- structured = Kreuzberg::StructuredData.new(
306
- data_type: 'json-ld',
307
- raw_json: json_data,
308
- schema_type: 'Article'
309
- )
310
-
311
- assert_equal 'json-ld', structured.data_type
312
- assert_equal json_data, structured.raw_json
313
- assert_equal 'Article', structured.schema_type
314
- parsed = JSON.parse(structured.raw_json)
315
- assert_equal 'Article', parsed['@type']
316
- end
317
-
318
- def test_structured_data_nil_schema_type
319
- json_data = '{"data":"value"}'
320
- structured = Kreuzberg::StructuredData.new(
321
- data_type: 'microdata',
322
- raw_json: json_data,
323
- schema_type: nil
324
- )
325
-
326
- assert_equal 'microdata', structured.data_type
327
- assert_nil structured.schema_type
328
- end
329
-
330
- # ============================================================================
331
- # Integration Tests
332
- # ============================================================================
333
-
334
- def test_extract_html_returns_metadata
335
- html_file = create_test_html_file(
336
- '<html><head><title>Test Page</title></head><body><p>Content</p></body></html>'
337
- )
338
-
339
- begin
340
- result = Kreuzberg.extract_file_sync(html_file)
341
- assert_instance_of Kreuzberg::Result, result
342
- assert_not_nil result.metadata
343
-
344
- if result.metadata.is_a?(Hash)
345
- assert result.metadata.is_a?(Hash)
346
- elsif result.metadata.is_a?(Kreuzberg::HtmlMetadata)
347
- assert result.metadata.is_a?(Kreuzberg::HtmlMetadata)
348
- end
349
- ensure
350
- FileUtils.rm_f(html_file)
351
- end
352
- end
353
-
354
- def test_metadata_keywords_array
355
- html_content = <<~HTML
356
- <html>
357
- <head>
358
- <title>Test</title>
359
- <meta name="keywords" content="ruby, testing, metadata">
360
- </head>
361
- <body></body>
362
- </html>
363
- HTML
364
- html_file = create_test_html_file(html_content)
365
-
366
- begin
367
- result = Kreuzberg.extract_file_sync(html_file)
368
- metadata = result.metadata
369
-
370
- if metadata.is_a?(Hash) && metadata['keywords']
371
- assert metadata['keywords'].is_a?(Array)
372
- elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
373
- assert_instance_of Array, metadata.keywords
374
- end
375
- ensure
376
- FileUtils.rm_f(html_file)
377
- end
378
- end
379
-
380
- def test_metadata_open_graph_hash
381
- html_content = <<~HTML
382
- <html>
383
- <head>
384
- <title>Test</title>
385
- <meta property="og:title" content="Test Title">
386
- <meta property="og:description" content="Test Description">
387
- <meta property="og:image" content="https://example.com/image.jpg">
388
- </head>
389
- <body></body>
390
- </html>
391
- HTML
392
- html_file = create_test_html_file(html_content)
393
-
394
- begin
395
- result = Kreuzberg.extract_file_sync(html_file)
396
- metadata = result.metadata
397
-
398
- if metadata.is_a?(Hash) && metadata['open_graph']
399
- assert metadata['open_graph'].is_a?(Hash)
400
- elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
401
- assert_instance_of Hash, metadata.open_graph
402
- end
403
- ensure
404
- FileUtils.rm_f(html_file)
405
- end
406
- end
407
-
408
- def test_metadata_headers_array
409
- html_content = <<~HTML
410
- <html>
411
- <head><title>Test</title></head>
412
- <body>
413
- <h1>Main Title</h1>
414
- <h2>Subtitle</h2>
415
- <h3 id="section-1">Section 1</h3>
416
- </body>
417
- </html>
418
- HTML
419
- html_file = create_test_html_file(html_content)
420
-
421
- begin
422
- result = Kreuzberg.extract_file_sync(html_file)
423
- metadata = result.metadata
424
-
425
- if metadata.is_a?(Hash) && metadata['headers']
426
- assert metadata['headers'].is_a?(Array)
427
- elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
428
- assert_instance_of Array, metadata.headers
429
- end
430
- ensure
431
- FileUtils.rm_f(html_file)
432
- end
433
- end
434
-
435
- def test_metadata_links_array
436
- html_content = <<~HTML
437
- <html>
438
- <head><title>Test</title></head>
439
- <body>
440
- <a href="https://example.com">External Link</a>
441
- <a href="/page">Internal Link</a>
442
- <a href="#section">Anchor Link</a>
443
- </body>
444
- </html>
445
- HTML
446
- html_file = create_test_html_file(html_content)
447
-
448
- begin
449
- result = Kreuzberg.extract_file_sync(html_file)
450
- metadata = result.metadata
451
-
452
- if metadata.is_a?(Hash) && metadata['links']
453
- assert metadata['links'].is_a?(Array)
454
- elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
455
- assert_instance_of Array, metadata.links
456
- end
457
- ensure
458
- FileUtils.rm_f(html_file)
459
- end
460
- end
461
-
462
- def test_metadata_images_array
463
- html_content = <<~HTML
464
- <html>
465
- <head><title>Test</title></head>
466
- <body>
467
- <img src="image1.jpg" alt="Image 1" width="200" height="100">
468
- <img src="image2.png" alt="Image 2">
469
- <img src="image3.gif">
470
- </body>
471
- </html>
472
- HTML
473
- html_file = create_test_html_file(html_content)
474
-
475
- begin
476
- result = Kreuzberg.extract_file_sync(html_file)
477
- metadata = result.metadata
478
-
479
- if metadata.is_a?(Hash) && metadata['images']
480
- assert metadata['images'].is_a?(Array)
481
- elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
482
- assert_instance_of Array, metadata.images
483
- end
484
- ensure
485
- FileUtils.rm_f(html_file)
486
- end
487
- end
488
-
489
- # ============================================================================
490
- # Edge Cases
491
- # ============================================================================
492
-
493
- def test_metadata_empty_html
494
- html_file = create_test_html_file('<html><body></body></html>')
495
-
496
- begin
497
- result = Kreuzberg.extract_file_sync(html_file)
498
- metadata = result.metadata
499
-
500
- if metadata.is_a?(Kreuzberg::HtmlMetadata)
501
- assert_instance_of Array, metadata.keywords
502
- assert_instance_of Hash, metadata.open_graph
503
- assert_instance_of Hash, metadata.twitter_card
504
- assert_instance_of Hash, metadata.meta_tags
505
- assert_instance_of Array, metadata.headers
506
- assert_instance_of Array, metadata.links
507
- assert_instance_of Array, metadata.images
508
- assert_instance_of Array, metadata.structured_data
509
- elsif metadata.is_a?(Hash)
510
- assert_instance_of Array, metadata['keywords'] || []
511
- assert_instance_of Hash, metadata['open_graph'] || {}
512
- assert_instance_of Hash, metadata['twitter_card'] || {}
513
- end
514
- ensure
515
- FileUtils.rm_f(html_file)
516
- end
517
- end
518
-
519
- def test_metadata_nil_optional_fields
520
- metadata = Kreuzberg::HtmlMetadata.new(
521
- title: nil,
522
- description: nil,
523
- author: nil,
524
- copyright: nil,
525
- keywords: [],
526
- canonical_url: nil,
527
- language: nil,
528
- text_direction: nil,
529
- mime_type: nil,
530
- charset: nil,
531
- generator: nil,
532
- viewport: nil,
533
- theme_color: nil,
534
- application_name: nil,
535
- robots: nil,
536
- open_graph: {},
537
- twitter_card: {},
538
- meta_tags: {},
539
- headers: [],
540
- links: [],
541
- images: [],
542
- structured_data: []
543
- )
544
-
545
- assert_nil metadata.title
546
- assert_nil metadata.description
547
- assert_nil metadata.author
548
- assert_nil metadata.copyright
549
- assert_nil metadata.canonical_url
550
- assert_nil metadata.language
551
- assert_nil metadata.text_direction
552
- assert_nil metadata.mime_type
553
- assert_nil metadata.charset
554
- assert_nil metadata.generator
555
- assert_nil metadata.viewport
556
- assert_nil metadata.theme_color
557
- assert_nil metadata.application_name
558
- assert_nil metadata.robots
559
- end
560
-
561
- def test_metadata_empty_collections
562
- metadata = Kreuzberg::HtmlMetadata.new(
563
- title: nil,
564
- description: nil,
565
- author: nil,
566
- copyright: nil,
567
- keywords: [],
568
- canonical_url: nil,
569
- language: nil,
570
- text_direction: nil,
571
- mime_type: nil,
572
- charset: nil,
573
- generator: nil,
574
- viewport: nil,
575
- theme_color: nil,
576
- application_name: nil,
577
- robots: nil,
578
- open_graph: {},
579
- twitter_card: {},
580
- meta_tags: {},
581
- headers: [],
582
- links: [],
583
- images: [],
584
- structured_data: []
585
- )
586
-
587
- assert_empty metadata.keywords
588
- assert_empty metadata.open_graph
589
- assert_empty metadata.twitter_card
590
- assert_empty metadata.meta_tags
591
- assert_empty metadata.headers
592
- assert_empty metadata.links
593
- assert_empty metadata.images
594
- assert_empty metadata.structured_data
595
- end
596
-
597
- # ============================================================================
598
- # Sorbet Type Safety
599
- # ============================================================================
600
-
601
- def test_type_checking_enabled
602
- metadata = Kreuzberg::HtmlMetadata.new(
603
- title: 'Test',
604
- description: nil,
605
- author: nil,
606
- copyright: nil,
607
- keywords: ['test'],
608
- canonical_url: nil,
609
- language: nil,
610
- text_direction: nil,
611
- mime_type: nil,
612
- charset: nil,
613
- generator: nil,
614
- viewport: nil,
615
- theme_color: nil,
616
- application_name: nil,
617
- robots: nil,
618
- open_graph: {},
619
- twitter_card: {},
620
- meta_tags: {},
621
- headers: [],
622
- links: [],
623
- images: [],
624
- structured_data: []
625
- )
626
-
627
- assert_kind_of Kreuzberg::HtmlMetadata, metadata
628
- assert metadata.respond_to?(:title)
629
- assert metadata.respond_to?(:keywords)
630
- assert metadata.respond_to?(:open_graph)
631
- end
632
-
633
- def test_immutable_tstruct_fields
634
- metadata = Kreuzberg::HtmlMetadata.new(
635
- title: 'Original',
636
- description: nil,
637
- author: nil,
638
- copyright: nil,
639
- keywords: [],
640
- canonical_url: nil,
641
- language: nil,
642
- text_direction: nil,
643
- mime_type: nil,
644
- charset: nil,
645
- generator: nil,
646
- viewport: nil,
647
- theme_color: nil,
648
- application_name: nil,
649
- robots: nil,
650
- open_graph: {},
651
- twitter_card: {},
652
- meta_tags: {},
653
- headers: [],
654
- links: [],
655
- images: [],
656
- structured_data: []
657
- )
658
-
659
- assert_raises(NoMethodError) { metadata.title = 'Modified' }
660
- end
661
-
662
- def test_headers_with_multiple_levels
663
- headers = [
664
- Kreuzberg::HeaderMetadata.new(level: 1, text: 'H1', id: nil, depth: 0, html_offset: 0),
665
- Kreuzberg::HeaderMetadata.new(level: 2, text: 'H2', id: nil, depth: 1, html_offset: 50),
666
- Kreuzberg::HeaderMetadata.new(level: 3, text: 'H3', id: 'sec-1', depth: 2, html_offset: 100),
667
- Kreuzberg::HeaderMetadata.new(level: 2, text: 'H2-2', id: nil, depth: 1, html_offset: 150)
668
- ]
669
-
670
- metadata = Kreuzberg::HtmlMetadata.new(
671
- title: nil,
672
- description: nil,
673
- author: nil,
674
- copyright: nil,
675
- keywords: [],
676
- canonical_url: nil,
677
- language: nil,
678
- text_direction: nil,
679
- mime_type: nil,
680
- charset: nil,
681
- generator: nil,
682
- viewport: nil,
683
- theme_color: nil,
684
- application_name: nil,
685
- robots: nil,
686
- open_graph: {},
687
- twitter_card: {},
688
- meta_tags: {},
689
- headers: headers,
690
- links: [],
691
- images: [],
692
- structured_data: []
693
- )
694
-
695
- assert_equal 4, metadata.headers.length
696
- assert_equal 1, metadata.headers[0].level
697
- assert_equal 3, metadata.headers[2].level
698
- assert_equal 'sec-1', metadata.headers[2].id
699
- end
700
-
701
- def test_links_with_various_types
702
- links = [
703
- Kreuzberg::LinkMetadata.new(
704
- href: 'https://external.com',
705
- text: 'External',
706
- title: nil,
707
- link_type: 'external',
708
- rel: ['noopener'],
709
- attributes: {}
710
- ),
711
- Kreuzberg::LinkMetadata.new(
712
- href: '/internal/page',
713
- text: 'Internal',
714
- title: 'Internal Page',
715
- link_type: 'internal',
716
- rel: [],
717
- attributes: { 'class' => 'nav-link' }
718
- ),
719
- Kreuzberg::LinkMetadata.new(
720
- href: '#section',
721
- text: 'Anchor',
722
- title: nil,
723
- link_type: 'anchor',
724
- rel: [],
725
- attributes: {}
726
- )
727
- ]
728
-
729
- metadata = Kreuzberg::HtmlMetadata.new(
730
- title: nil,
731
- description: nil,
732
- author: nil,
733
- copyright: nil,
734
- keywords: [],
735
- canonical_url: nil,
736
- language: nil,
737
- text_direction: nil,
738
- mime_type: nil,
739
- charset: nil,
740
- generator: nil,
741
- viewport: nil,
742
- theme_color: nil,
743
- application_name: nil,
744
- robots: nil,
745
- open_graph: {},
746
- twitter_card: {},
747
- meta_tags: {},
748
- headers: [],
749
- links: links,
750
- images: [],
751
- structured_data: []
752
- )
753
-
754
- assert_equal 3, metadata.links.length
755
- assert_equal 'external', metadata.links[0].link_type
756
- assert_equal 'internal', metadata.links[1].link_type
757
- assert_equal 'anchor', metadata.links[2].link_type
758
- assert_equal 'nav-link', metadata.links[1].attributes['class']
759
- end
760
-
761
- def test_images_with_attributes
762
- images = [
763
- Kreuzberg::ImageMetadata.new(
764
- src: 'logo.png',
765
- alt: 'Logo',
766
- title: nil,
767
- dimensions: [200, 100],
768
- image_type: 'png',
769
- attributes: { 'class' => 'logo', 'loading' => 'eager' }
770
- ),
771
- Kreuzberg::ImageMetadata.new(
772
- src: 'thumbnail.jpg',
773
- alt: nil,
774
- title: 'Thumbnail',
775
- dimensions: nil,
776
- image_type: 'jpg',
777
- attributes: { 'loading' => 'lazy', 'decoding' => 'async' }
778
- )
779
- ]
780
-
781
- metadata = Kreuzberg::HtmlMetadata.new(
782
- title: nil,
783
- description: nil,
784
- author: nil,
785
- copyright: nil,
786
- keywords: [],
787
- canonical_url: nil,
788
- language: nil,
789
- text_direction: nil,
790
- mime_type: nil,
791
- charset: nil,
792
- generator: nil,
793
- viewport: nil,
794
- theme_color: nil,
795
- application_name: nil,
796
- robots: nil,
797
- open_graph: {},
798
- twitter_card: {},
799
- meta_tags: {},
800
- headers: [],
801
- links: [],
802
- images: images,
803
- structured_data: []
804
- )
805
-
806
- assert_equal 2, metadata.images.length
807
- assert_equal [200, 100], metadata.images[0].dimensions
808
- assert_nil metadata.images[1].dimensions
809
- assert_equal 'lazy', metadata.images[1].attributes['loading']
810
- end
811
-
812
- def test_structured_data_multiple_types
813
- json_ld = '{"@context":"https://schema.org","@type":"Article"}'
814
- microdata = '{"type":"http://schema.org/Person"}'
815
-
816
- structured_data = [
817
- Kreuzberg::StructuredData.new(
818
- data_type: 'json-ld',
819
- raw_json: json_ld,
820
- schema_type: 'Article'
821
- ),
822
- Kreuzberg::StructuredData.new(
823
- data_type: 'microdata',
824
- raw_json: microdata,
825
- schema_type: 'Person'
826
- ),
827
- Kreuzberg::StructuredData.new(
828
- data_type: 'json-ld',
829
- raw_json: '{"@type":"Organization"}',
830
- schema_type: nil
831
- )
832
- ]
833
-
834
- metadata = Kreuzberg::HtmlMetadata.new(
835
- title: nil,
836
- description: nil,
837
- author: nil,
838
- copyright: nil,
839
- keywords: [],
840
- canonical_url: nil,
841
- language: nil,
842
- text_direction: nil,
843
- mime_type: nil,
844
- charset: nil,
845
- generator: nil,
846
- viewport: nil,
847
- theme_color: nil,
848
- application_name: nil,
849
- robots: nil,
850
- open_graph: {},
851
- twitter_card: {},
852
- meta_tags: {},
853
- headers: [],
854
- links: [],
855
- images: [],
856
- structured_data: structured_data
857
- )
858
-
859
- assert_equal 3, metadata.structured_data.length
860
- assert_equal 'json-ld', metadata.structured_data[0].data_type
861
- assert_equal 'Article', metadata.structured_data[0].schema_type
862
- assert_equal 'microdata', metadata.structured_data[1].data_type
863
- assert_nil metadata.structured_data[2].schema_type
864
- end
865
-
866
- def test_html_metadata_with_all_fields_populated
867
- headers = [
868
- Kreuzberg::HeaderMetadata.new(level: 1, text: 'Title', id: 'title', depth: 0, html_offset: 100)
869
- ]
870
- links = [
871
- Kreuzberg::LinkMetadata.new(
872
- href: 'https://example.com',
873
- text: 'Example',
874
- title: 'Example Site',
875
- link_type: 'external',
876
- rel: ['noopener'],
877
- attributes: { 'data-track' => 'true' }
878
- )
879
- ]
880
- images = [
881
- Kreuzberg::ImageMetadata.new(
882
- src: 'image.jpg',
883
- alt: 'Test Image',
884
- title: nil,
885
- dimensions: [300, 200],
886
- image_type: 'jpg',
887
- attributes: { 'loading' => 'lazy' }
888
- )
889
- ]
890
- structured = [
891
- Kreuzberg::StructuredData.new(
892
- data_type: 'json-ld',
893
- raw_json: '{"@type":"WebPage"}',
894
- schema_type: 'WebPage'
895
- )
896
- ]
897
-
898
- metadata = Kreuzberg::HtmlMetadata.new(
899
- title: 'Complete Test Page',
900
- description: 'A complete test page with all metadata',
901
- author: 'Test Author',
902
- copyright: '2024 Test Corp',
903
- keywords: %w[test comprehensive metadata],
904
- canonical_url: 'https://example.com/test',
905
- language: 'en',
906
- text_direction: 'ltr',
907
- mime_type: 'text/html; charset=utf-8',
908
- charset: 'utf-8',
909
- generator: 'Kreuzberg',
910
- viewport: 'width=device-width, initial-scale=1',
911
- theme_color: '#ffffff',
912
- application_name: 'Test App',
913
- robots: 'index, follow',
914
- open_graph: {
915
- 'og:title' => 'Test',
916
- 'og:description' => 'Description',
917
- 'og:image' => 'https://example.com/image.jpg'
918
- },
919
- twitter_card: {
920
- 'twitter:card' => 'summary_large_image',
921
- 'twitter:title' => 'Test'
922
- },
923
- meta_tags: {
924
- 'custom-tag' => 'custom-value'
925
- },
926
- headers: headers,
927
- links: links,
928
- images: images,
929
- structured_data: structured
930
- )
931
-
932
- assert_equal 'Complete Test Page', metadata.title
933
- assert_equal 'A complete test page with all metadata', metadata.description
934
- assert_equal 'Test Author', metadata.author
935
- assert_equal '2024 Test Corp', metadata.copyright
936
- assert_equal 3, metadata.keywords.length
937
- assert_equal 'https://example.com/test', metadata.canonical_url
938
- assert_equal 'en', metadata.language
939
- assert_equal 'ltr', metadata.text_direction
940
- assert_equal 'Kreuzberg', metadata.generator
941
- assert_equal 3, metadata.open_graph.length
942
- assert_equal 2, metadata.twitter_card.length
943
- assert_equal 1, metadata.meta_tags.length
944
- assert_equal 1, metadata.headers.length
945
- assert_equal 1, metadata.links.length
946
- assert_equal 1, metadata.images.length
947
- assert_equal 1, metadata.structured_data.length
948
- end
949
-
950
- private
951
-
952
- def create_test_html_file(content)
953
- file = Tempfile.new(['test', '.html'])
954
- file.write(content)
955
- file.close
956
- file.path
957
- end
958
- end
959
- # rubocop:enable Metrics/ClassLength, Metrics/MethodLength, Metrics/AbcSize