kreuzberg 4.0.0.rc2 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (446) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +391 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -0,0 +1,959 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'minitest/autorun'
4
+ require 'kreuzberg'
5
+ require 'json'
6
+ require 'tempfile'
7
+
8
+ # Comprehensive tests for Kreuzberg metadata types
9
+ # Tests verify T::Struct behavior, type safety, and integration with extraction
10
+ # rubocop:disable Metrics/ClassLength, Metrics/MethodLength, Metrics/AbcSize
11
+ class MetadataTypesTest < Minitest::Test
12
+ def test_html_metadata_structure
13
+ metadata = Kreuzberg::HtmlMetadata.new(
14
+ title: 'Test Page',
15
+ description: 'A test description',
16
+ author: 'Test Author',
17
+ copyright: '2024 Test Corp',
18
+ keywords: %w[test metadata],
19
+ canonical_url: 'https://example.com/test',
20
+ language: 'en',
21
+ text_direction: 'ltr',
22
+ mime_type: 'text/html',
23
+ charset: 'utf-8',
24
+ generator: 'Kreuzberg',
25
+ viewport: 'width=device-width, initial-scale=1',
26
+ theme_color: '#ffffff',
27
+ application_name: 'Test App',
28
+ robots: 'index, follow',
29
+ open_graph: { 'og:title' => 'Test', 'og:image' => 'image.jpg' },
30
+ twitter_card: { 'twitter:card' => 'summary' },
31
+ meta_tags: { 'custom' => 'value' },
32
+ headers: [],
33
+ links: [],
34
+ images: [],
35
+ structured_data: []
36
+ )
37
+
38
+ assert_equal 'Test Page', metadata.title
39
+ assert_equal 'A test description', metadata.description
40
+ assert_equal 'Test Author', metadata.author
41
+ assert_equal '2024 Test Corp', metadata.copyright
42
+ assert_equal 'https://example.com/test', metadata.canonical_url
43
+ assert_equal 'en', metadata.language
44
+ assert_equal 'ltr', metadata.text_direction
45
+ assert_equal 'text/html', metadata.mime_type
46
+ assert_equal 'utf-8', metadata.charset
47
+ assert_equal 'Kreuzberg', metadata.generator
48
+ assert_equal '#ffffff', metadata.theme_color
49
+ assert_equal 'Test App', metadata.application_name
50
+ assert_equal 'index, follow', metadata.robots
51
+ end
52
+
53
+ def test_keywords_is_array
54
+ keywords_array = %w[test metadata array]
55
+ metadata = Kreuzberg::HtmlMetadata.new(
56
+ title: nil,
57
+ description: nil,
58
+ author: nil,
59
+ copyright: nil,
60
+ keywords: keywords_array,
61
+ canonical_url: nil,
62
+ language: nil,
63
+ text_direction: nil,
64
+ mime_type: nil,
65
+ charset: nil,
66
+ generator: nil,
67
+ viewport: nil,
68
+ theme_color: nil,
69
+ application_name: nil,
70
+ robots: nil,
71
+ open_graph: {},
72
+ twitter_card: {},
73
+ meta_tags: {},
74
+ headers: [],
75
+ links: [],
76
+ images: [],
77
+ structured_data: []
78
+ )
79
+
80
+ assert_instance_of Array, metadata.keywords
81
+ assert_equal keywords_array, metadata.keywords
82
+ metadata.keywords.each { |keyword| assert_instance_of String, keyword }
83
+ end
84
+
85
+ def test_canonical_url_renamed
86
+ metadata = Kreuzberg::HtmlMetadata.new(
87
+ title: nil,
88
+ description: nil,
89
+ author: nil,
90
+ copyright: nil,
91
+ keywords: [],
92
+ canonical_url: 'https://example.com/canonical',
93
+ language: nil,
94
+ text_direction: nil,
95
+ mime_type: nil,
96
+ charset: nil,
97
+ generator: nil,
98
+ viewport: nil,
99
+ theme_color: nil,
100
+ application_name: nil,
101
+ robots: nil,
102
+ open_graph: {},
103
+ twitter_card: {},
104
+ meta_tags: {},
105
+ headers: [],
106
+ links: [],
107
+ images: [],
108
+ structured_data: []
109
+ )
110
+
111
+ assert_equal 'https://example.com/canonical', metadata.canonical_url
112
+ assert_respond_to metadata, :canonical_url
113
+ end
114
+
115
+ def test_open_graph_is_hash
116
+ og_tags = {
117
+ 'og:title' => 'Test Title',
118
+ 'og:description' => 'Test Description',
119
+ 'og:image' => 'https://example.com/image.jpg',
120
+ 'og:url' => 'https://example.com'
121
+ }
122
+ metadata = Kreuzberg::HtmlMetadata.new(
123
+ title: nil,
124
+ description: nil,
125
+ author: nil,
126
+ copyright: nil,
127
+ keywords: [],
128
+ canonical_url: nil,
129
+ language: nil,
130
+ text_direction: nil,
131
+ mime_type: nil,
132
+ charset: nil,
133
+ generator: nil,
134
+ viewport: nil,
135
+ theme_color: nil,
136
+ application_name: nil,
137
+ robots: nil,
138
+ open_graph: og_tags,
139
+ twitter_card: {},
140
+ meta_tags: {},
141
+ headers: [],
142
+ links: [],
143
+ images: [],
144
+ structured_data: []
145
+ )
146
+
147
+ assert_instance_of Hash, metadata.open_graph
148
+ assert_equal og_tags, metadata.open_graph
149
+ metadata.open_graph.each do |key, value|
150
+ assert_instance_of String, key
151
+ assert_instance_of String, value
152
+ end
153
+ end
154
+
155
# Twitter card tags given to the constructor come back as a Hash whose keys
# and values are all Strings.
def test_twitter_card_is_hash
  twitter_tags = {
    'twitter:card' => 'summary_large_image',
    'twitter:title' => 'Test',
    'twitter:description' => 'Description',
    'twitter:image' => 'https://example.com/image.jpg'
  }
  blank_scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].each_with_object({}) { |field, acc| acc[field] = nil }

  metadata = Kreuzberg::HtmlMetadata.new(
    **blank_scalars,
    keywords: [],
    open_graph: {}, twitter_card: twitter_tags, meta_tags: {},
    headers: [], links: [], images: [], structured_data: []
  )

  assert_instance_of Hash, metadata.twitter_card
  assert_equal twitter_tags, metadata.twitter_card
  metadata.twitter_card.each_pair do |name, content|
    assert_instance_of String, name
    assert_instance_of String, content
  end
end
194
+
195
+ # ============================================================================
196
+ # T::Struct Behavior Tests
197
+ # ============================================================================
198
+
199
# A HeaderMetadata built with every field set returns each value unchanged
# from the reader of the same name.
def test_header_metadata_creation
  fields = { level: 1, text: 'Main Title', id: 'main-title', depth: 0, html_offset: 245 }
  header = Kreuzberg::HeaderMetadata.new(**fields)

  # Readers are checked in the same order as the constructor arguments.
  fields.each do |reader, expected|
    assert_equal expected, header.public_send(reader)
  end
end
214
+
215
# A header may be built with a nil +id+; the other fields are still returned
# as given.
def test_header_metadata_nil_id
  subtitle = Kreuzberg::HeaderMetadata.new(
    level: 2, text: 'Subtitle', id: nil, depth: 1, html_offset: 456
  )

  assert_equal 2, subtitle.level
  assert_equal 'Subtitle', subtitle.text
  assert_nil subtitle.id
  assert_equal 1, subtitle.depth
  assert_equal 456, subtitle.html_offset
end
230
+
231
# A fully populated LinkMetadata exposes its scalar fields, its +rel+ list
# (an Array) and its +attributes+ (a Hash) exactly as constructed.
def test_link_metadata_creation
  rel_values = %w[noopener noreferrer]
  html_attrs = { 'data-id' => '123', 'class' => 'external-link' }
  link = Kreuzberg::LinkMetadata.new(
    href: 'https://example.com', text: 'Example', title: 'Example Site',
    link_type: 'external', rel: rel_values, attributes: html_attrs
  )

  assert_equal 'https://example.com', link.href
  assert_equal 'Example', link.text
  assert_equal 'Example Site', link.title
  assert_equal 'external', link.link_type

  assert_instance_of Array, link.rel
  assert_equal %w[noopener noreferrer], link.rel

  assert_instance_of Hash, link.attributes
  assert_equal '123', link.attributes['data-id']
  assert_equal 'external-link', link.attributes['class']
end
251
+
252
# A link built with an empty +rel+ list, empty +attributes+ and a nil title
# reports those values back as empty / nil.
def test_link_metadata_empty_arrays_and_hashes
  bare_link = Kreuzberg::LinkMetadata.new(
    href: 'https://example.com', text: 'Link', title: nil,
    link_type: 'internal', rel: [], attributes: {}
  )

  assert_equal 'https://example.com', bare_link.href
  assert_empty bare_link.rel
  assert_empty bare_link.attributes
  assert_nil bare_link.title
end
267
+
268
# An ImageMetadata with dimensions and attributes returns them as an Array
# and a Hash respectively, alongside its scalar fields.
def test_image_metadata_creation
  html_attrs = { 'loading' => 'lazy', 'class' => 'logo' }
  logo = Kreuzberg::ImageMetadata.new(
    src: 'images/logo.png', alt: 'Company Logo', title: nil,
    dimensions: [200, 100], image_type: 'png', attributes: html_attrs
  )

  assert_equal 'images/logo.png', logo.src
  assert_equal 'Company Logo', logo.alt
  assert_nil logo.title

  assert_instance_of Array, logo.dimensions
  assert_equal [200, 100], logo.dimensions
  assert_equal 'png', logo.image_type

  assert_instance_of Hash, logo.attributes
  assert_equal 'lazy', logo.attributes['loading']
end
287
+
288
# An image may be built with nil +dimensions+; the reader then returns nil.
def test_image_metadata_nil_dimensions
  unsized = Kreuzberg::ImageMetadata.new(
    src: 'image.jpg', alt: 'Description', title: 'Title',
    dimensions: nil, image_type: 'jpg', attributes: {}
  )

  assert_equal 'image.jpg', unsized.src
  assert_nil unsized.dimensions
  assert_equal 'jpg', unsized.image_type
end
302
+
303
# StructuredData stores the raw JSON string untouched, so it can still be
# parsed with JSON.parse after the round trip.
def test_structured_data_creation
  json_data = '{"@context":"https://schema.org","@type":"Article","headline":"Test Article"}'
  article = Kreuzberg::StructuredData.new(
    data_type: 'json-ld', raw_json: json_data, schema_type: 'Article'
  )

  assert_equal 'json-ld', article.data_type
  assert_equal json_data, article.raw_json
  assert_equal 'Article', article.schema_type

  decoded = JSON.parse(article.raw_json)
  assert_equal 'Article', decoded['@type']
end
317
+
318
# StructuredData may be built with a nil +schema_type+.
def test_structured_data_nil_schema_type
  untyped = Kreuzberg::StructuredData.new(
    data_type: 'microdata', raw_json: '{"data":"value"}', schema_type: nil
  )

  assert_equal 'microdata', untyped.data_type
  assert_nil untyped.schema_type
end
329
+
330
+ # ============================================================================
331
+ # Integration Tests
332
+ # ============================================================================
333
+
334
# Extracting a minimal HTML file yields a Kreuzberg::Result whose metadata is
# present and is either a plain Hash or a Kreuzberg::HtmlMetadata.
#
# The previous version asserted `is_a?(X)` only inside an `if is_a?(X)`
# branch, which can never fail; the type check is now one real assertion.
def test_extract_html_returns_metadata
  html_file = create_test_html_file(
    '<html><head><title>Test Page</title></head><body><p>Content</p></body></html>'
  )

  begin
    result = Kreuzberg.extract_file_sync(html_file)
    assert_instance_of Kreuzberg::Result, result
    assert_not_nil result.metadata

    metadata = result.metadata
    assert(
      metadata.is_a?(Hash) || metadata.is_a?(Kreuzberg::HtmlMetadata),
      "expected metadata to be a Hash or Kreuzberg::HtmlMetadata, got #{metadata.class}"
    )
  ensure
    # Always remove the temporary fixture, even when an assertion fails.
    FileUtils.rm_f(html_file)
  end
end
353
+
354
# When extraction of a page with a keywords <meta> tag yields keywords, they
# are an Array. Hash metadata is checked through the 'keywords' key,
# HtmlMetadata through the +keywords+ reader; if neither shape matches (or a
# Hash has no keywords) no assertion is made.
def test_metadata_keywords_array
  page = <<~HTML
    <html>
    <head>
    <title>Test</title>
    <meta name="keywords" content="ruby, testing, metadata">
    </head>
    <body></body>
    </html>
  HTML
  html_file = create_test_html_file(page)

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata
    hash_keywords = metadata.is_a?(Hash) && metadata['keywords']

    if hash_keywords
      assert hash_keywords.is_a?(Array)
    elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_instance_of Array, metadata.keywords
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
379
+
380
# When extraction of a page with og:* <meta> tags yields Open Graph data, it
# is a Hash. Hash metadata is checked through the 'open_graph' key,
# HtmlMetadata through the +open_graph+ reader; otherwise no assertion is made.
def test_metadata_open_graph_hash
  page = <<~HTML
    <html>
    <head>
    <title>Test</title>
    <meta property="og:title" content="Test Title">
    <meta property="og:description" content="Test Description">
    <meta property="og:image" content="https://example.com/image.jpg">
    </head>
    <body></body>
    </html>
  HTML
  html_file = create_test_html_file(page)

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata
    hash_open_graph = metadata.is_a?(Hash) && metadata['open_graph']

    if hash_open_graph
      assert hash_open_graph.is_a?(Hash)
    elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_instance_of Hash, metadata.open_graph
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
407
+
408
# When extraction of a page with h1-h3 headings yields header data, it is an
# Array. Hash metadata is checked through the 'headers' key, HtmlMetadata
# through the +headers+ reader; otherwise no assertion is made.
def test_metadata_headers_array
  page = <<~HTML
    <html>
    <head><title>Test</title></head>
    <body>
    <h1>Main Title</h1>
    <h2>Subtitle</h2>
    <h3 id="section-1">Section 1</h3>
    </body>
    </html>
  HTML
  html_file = create_test_html_file(page)

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata
    hash_headers = metadata.is_a?(Hash) && metadata['headers']

    if hash_headers
      assert hash_headers.is_a?(Array)
    elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_instance_of Array, metadata.headers
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
434
+
435
# When extraction of a page with external, internal and anchor links yields
# link data, it is an Array. Hash metadata is checked through the 'links'
# key, HtmlMetadata through the +links+ reader; otherwise no assertion is made.
def test_metadata_links_array
  page = <<~HTML
    <html>
    <head><title>Test</title></head>
    <body>
    <a href="https://example.com">External Link</a>
    <a href="/page">Internal Link</a>
    <a href="#section">Anchor Link</a>
    </body>
    </html>
  HTML
  html_file = create_test_html_file(page)

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata
    hash_links = metadata.is_a?(Hash) && metadata['links']

    if hash_links
      assert hash_links.is_a?(Array)
    elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_instance_of Array, metadata.links
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
461
+
462
# When extraction of a page with several <img> tags yields image data, it is
# an Array. Hash metadata is checked through the 'images' key, HtmlMetadata
# through the +images+ reader; otherwise no assertion is made.
def test_metadata_images_array
  page = <<~HTML
    <html>
    <head><title>Test</title></head>
    <body>
    <img src="image1.jpg" alt="Image 1" width="200" height="100">
    <img src="image2.png" alt="Image 2">
    <img src="image3.gif">
    </body>
    </html>
  HTML
  html_file = create_test_html_file(page)

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata
    hash_images = metadata.is_a?(Hash) && metadata['images']

    if hash_images
      assert hash_images.is_a?(Array)
    elsif metadata.is_a?(Kreuzberg::HtmlMetadata)
      assert_instance_of Array, metadata.images
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
488
+
489
+ # ============================================================================
490
+ # Edge Cases
491
+ # ============================================================================
492
+
493
# Extracting an HTML document with an empty body still yields well-typed
# collections: for HtmlMetadata every collection reader is checked; for Hash
# metadata a missing (nil) entry is treated as an empty collection.
def test_metadata_empty_html
  html_file = create_test_html_file('<html><body></body></html>')

  begin
    metadata = Kreuzberg.extract_file_sync(html_file).metadata

    if metadata.is_a?(Kreuzberg::HtmlMetadata)
      {
        keywords: Array, open_graph: Hash, twitter_card: Hash, meta_tags: Hash,
        headers: Array, links: Array, images: Array, structured_data: Array
      }.each do |reader, klass|
        assert_instance_of klass, metadata.public_send(reader)
      end
    elsif metadata.is_a?(Hash)
      { 'keywords' => Array, 'open_graph' => Hash, 'twitter_card' => Hash }.each do |key, klass|
        # klass.new is an empty Array/Hash, standing in for an absent entry.
        assert_instance_of klass, metadata[key] || klass.new
      end
    end
  ensure
    FileUtils.rm_f(html_file)
  end
end
518
+
519
# Every scalar field of HtmlMetadata accepts nil and reads back as nil.
def test_metadata_nil_optional_fields
  scalar_fields = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ]
  blank_scalars = scalar_fields.each_with_object({}) { |field, acc| acc[field] = nil }

  metadata = Kreuzberg::HtmlMetadata.new(
    **blank_scalars,
    keywords: [],
    open_graph: {}, twitter_card: {}, meta_tags: {},
    headers: [], links: [], images: [], structured_data: []
  )

  scalar_fields.each { |field| assert_nil metadata.public_send(field) }
end
560
+
561
# Every collection field of HtmlMetadata accepts an empty Array/Hash and
# reads back as empty.
def test_metadata_empty_collections
  blank_scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].each_with_object({}) { |field, acc| acc[field] = nil }

  metadata = Kreuzberg::HtmlMetadata.new(
    **blank_scalars,
    keywords: [],
    open_graph: {}, twitter_card: {}, meta_tags: {},
    headers: [], links: [], images: [], structured_data: []
  )

  %i[
    keywords open_graph twitter_card meta_tags headers links images structured_data
  ].each do |collection|
    assert_empty metadata.public_send(collection)
  end
end
596
+
597
+ # ============================================================================
598
+ # Sorbet Type Safety
599
+ # ============================================================================
600
+
601
# A constructed HtmlMetadata is an instance of that class (or a subclass) and
# responds to the +title+, +keywords+ and +open_graph+ readers.
def test_type_checking_enabled
  blank_scalars = %i[
    description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].each_with_object({}) { |field, acc| acc[field] = nil }

  metadata = Kreuzberg::HtmlMetadata.new(
    **blank_scalars,
    title: 'Test',
    keywords: ['test'],
    open_graph: {}, twitter_card: {}, meta_tags: {},
    headers: [], links: [], images: [], structured_data: []
  )

  assert_kind_of Kreuzberg::HtmlMetadata, metadata
  %i[title keywords open_graph].each do |reader|
    assert metadata.respond_to?(reader)
  end
end
632
+
633
# HtmlMetadata has no +title=+ writer: assigning to the field raises
# NoMethodError.
def test_immutable_tstruct_fields
  blank_scalars = %i[
    description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].each_with_object({}) { |field, acc| acc[field] = nil }

  metadata = Kreuzberg::HtmlMetadata.new(
    **blank_scalars,
    title: 'Original',
    keywords: [],
    open_graph: {}, twitter_card: {}, meta_tags: {},
    headers: [], links: [], images: [], structured_data: []
  )

  assert_raises(NoMethodError) do
    metadata.title = 'Modified'
  end
end
661
+
662
# HtmlMetadata keeps a list of HeaderMetadata objects in order, preserving
# each header's level and id.
def test_headers_with_multiple_levels
  # Columns: level, text, id, depth, html_offset.
  header_rows = [
    [1, 'H1', nil, 0, 0],
    [2, 'H2', nil, 1, 50],
    [3, 'H3', 'sec-1', 2, 100],
    [2, 'H2-2', nil, 1, 150]
  ]
  headers = header_rows.map do |level, text, id, depth, html_offset|
    Kreuzberg::HeaderMetadata.new(
      level: level, text: text, id: id, depth: depth, html_offset: html_offset
    )
  end

  blank_scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].each_with_object({}) { |field, acc| acc[field] = nil }

  metadata = Kreuzberg::HtmlMetadata.new(
    **blank_scalars,
    keywords: [],
    open_graph: {}, twitter_card: {}, meta_tags: {},
    headers: headers, links: [], images: [], structured_data: []
  )

  assert_equal 4, metadata.headers.length
  assert_equal 1, metadata.headers[0].level
  assert_equal 3, metadata.headers[2].level
  assert_equal 'sec-1', metadata.headers[2].id
end
700
+
701
# HtmlMetadata keeps external, internal and anchor LinkMetadata objects in
# order, with their link types and attributes intact.
def test_links_with_various_types
  external = Kreuzberg::LinkMetadata.new(
    href: 'https://external.com', text: 'External', title: nil,
    link_type: 'external', rel: ['noopener'], attributes: {}
  )
  internal = Kreuzberg::LinkMetadata.new(
    href: '/internal/page', text: 'Internal', title: 'Internal Page',
    link_type: 'internal', rel: [], attributes: { 'class' => 'nav-link' }
  )
  anchor = Kreuzberg::LinkMetadata.new(
    href: '#section', text: 'Anchor', title: nil,
    link_type: 'anchor', rel: [], attributes: {}
  )

  blank_scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].each_with_object({}) { |field, acc| acc[field] = nil }

  metadata = Kreuzberg::HtmlMetadata.new(
    **blank_scalars,
    keywords: [],
    open_graph: {}, twitter_card: {}, meta_tags: {},
    headers: [], links: [external, internal, anchor], images: [], structured_data: []
  )

  assert_equal 3, metadata.links.length
  %w[external internal anchor].each_with_index do |expected_type, index|
    assert_equal expected_type, metadata.links[index].link_type
  end
  assert_equal 'nav-link', metadata.links[1].attributes['class']
end
760
+
761
# HtmlMetadata keeps ImageMetadata objects in order, including optional
# dimensions (present or nil) and per-image attributes.
def test_images_with_attributes
  logo = Kreuzberg::ImageMetadata.new(
    src: 'logo.png', alt: 'Logo', title: nil,
    dimensions: [200, 100], image_type: 'png',
    attributes: { 'class' => 'logo', 'loading' => 'eager' }
  )
  thumbnail = Kreuzberg::ImageMetadata.new(
    src: 'thumbnail.jpg', alt: nil, title: 'Thumbnail',
    dimensions: nil, image_type: 'jpg',
    attributes: { 'loading' => 'lazy', 'decoding' => 'async' }
  )

  blank_scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].each_with_object({}) { |field, acc| acc[field] = nil }

  metadata = Kreuzberg::HtmlMetadata.new(
    **blank_scalars,
    keywords: [],
    open_graph: {}, twitter_card: {}, meta_tags: {},
    headers: [], links: [], images: [logo, thumbnail], structured_data: []
  )

  assert_equal 2, metadata.images.length
  assert_equal [200, 100], metadata.images[0].dimensions
  assert_nil metadata.images[1].dimensions
  assert_equal 'lazy', metadata.images[1].attributes['loading']
end
811
+
812
# HtmlMetadata keeps several StructuredData entries of different data types
# in order, including one whose +schema_type+ is nil.
def test_structured_data_multiple_types
  # Columns: data_type, raw_json, schema_type.
  entries = [
    ['json-ld', '{"@context":"https://schema.org","@type":"Article"}', 'Article'],
    ['microdata', '{"type":"http://schema.org/Person"}', 'Person'],
    ['json-ld', '{"@type":"Organization"}', nil]
  ]
  structured_data = entries.map do |data_type, raw_json, schema_type|
    Kreuzberg::StructuredData.new(
      data_type: data_type, raw_json: raw_json, schema_type: schema_type
    )
  end

  blank_scalars = %i[
    title description author copyright canonical_url language text_direction
    mime_type charset generator viewport theme_color application_name robots
  ].each_with_object({}) { |field, acc| acc[field] = nil }

  metadata = Kreuzberg::HtmlMetadata.new(
    **blank_scalars,
    keywords: [],
    open_graph: {}, twitter_card: {}, meta_tags: {},
    headers: [], links: [], images: [], structured_data: structured_data
  )

  assert_equal 3, metadata.structured_data.length
  assert_equal 'json-ld', metadata.structured_data[0].data_type
  assert_equal 'Article', metadata.structured_data[0].schema_type
  assert_equal 'microdata', metadata.structured_data[1].data_type
  assert_nil metadata.structured_data[2].schema_type
end
865
+
866
# An HtmlMetadata with every field populated returns each scalar as given and
# reports the expected size for each collection.
def test_html_metadata_with_all_fields_populated
  header = Kreuzberg::HeaderMetadata.new(
    level: 1, text: 'Title', id: 'title', depth: 0, html_offset: 100
  )
  link = Kreuzberg::LinkMetadata.new(
    href: 'https://example.com', text: 'Example', title: 'Example Site',
    link_type: 'external', rel: ['noopener'], attributes: { 'data-track' => 'true' }
  )
  image = Kreuzberg::ImageMetadata.new(
    src: 'image.jpg', alt: 'Test Image', title: nil,
    dimensions: [300, 200], image_type: 'jpg', attributes: { 'loading' => 'lazy' }
  )
  web_page = Kreuzberg::StructuredData.new(
    data_type: 'json-ld', raw_json: '{"@type":"WebPage"}', schema_type: 'WebPage'
  )

  open_graph = {
    'og:title' => 'Test',
    'og:description' => 'Description',
    'og:image' => 'https://example.com/image.jpg'
  }
  twitter_card = {
    'twitter:card' => 'summary_large_image',
    'twitter:title' => 'Test'
  }

  metadata = Kreuzberg::HtmlMetadata.new(
    title: 'Complete Test Page',
    description: 'A complete test page with all metadata',
    author: 'Test Author',
    copyright: '2024 Test Corp',
    keywords: %w[test comprehensive metadata],
    canonical_url: 'https://example.com/test',
    language: 'en',
    text_direction: 'ltr',
    mime_type: 'text/html; charset=utf-8',
    charset: 'utf-8',
    generator: 'Kreuzberg',
    viewport: 'width=device-width, initial-scale=1',
    theme_color: '#ffffff',
    application_name: 'Test App',
    robots: 'index, follow',
    open_graph: open_graph,
    twitter_card: twitter_card,
    meta_tags: { 'custom-tag' => 'custom-value' },
    headers: [header],
    links: [link],
    images: [image],
    structured_data: [web_page]
  )

  # Scalar fields.
  assert_equal 'Complete Test Page', metadata.title
  assert_equal 'A complete test page with all metadata', metadata.description
  assert_equal 'Test Author', metadata.author
  assert_equal '2024 Test Corp', metadata.copyright
  assert_equal 3, metadata.keywords.length
  assert_equal 'https://example.com/test', metadata.canonical_url
  assert_equal 'en', metadata.language
  assert_equal 'ltr', metadata.text_direction
  assert_equal 'Kreuzberg', metadata.generator

  # Collection sizes.
  {
    open_graph: 3, twitter_card: 2, meta_tags: 1,
    headers: 1, links: 1, images: 1, structured_data: 1
  }.each do |reader, expected_size|
    assert_equal expected_size, metadata.public_send(reader).length
  end
end
949
+
950
+ private
951
+
952
# Writes +content+ to a new temporary file with a ".html" suffix and returns
# its path as a String.
#
# Tempfile.create (without a block) is used instead of Tempfile.new: a
# Tempfile object unlinks its file when it is garbage-collected, and the
# previous code returned only the path, so the file could disappear before the
# caller read it. Tempfile.create returns a plain File that is never removed
# automatically; callers are responsible for deleting the file (the tests do
# so with FileUtils.rm_f in an ensure block).
def create_test_html_file(content)
  file = Tempfile.create(['test', '.html'])
  begin
    file.write(content)
  ensure
    # Close even if the write fails so the descriptor is not leaked.
    file.close
  end
  file.path
end
958
+ end
959
+ # rubocop:enable Metrics/ClassLength, Metrics/MethodLength, Metrics/AbcSize