kreuzcrawl 0.3.0.pre.rc.71 → 0.3.0.pre.rc.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f9dc9a2b925bd356635ef537594ccf72049f8bbfa47754f348fe0643443a4c9f
4
- data.tar.gz: 06b86af2e1ef230a61a1c9c81d84115e2346e558de94df0d17c08097e63c84e1
3
+ metadata.gz: 8edea9a39a5cbe88bec892ae7fc8de37f0a4b4d5dc3430f5a6e4c0ccb475e443
4
+ data.tar.gz: 3dd64e9b4d361cc4d3ec82178260aaa018b0ca1d1d2f76034cb20ed05c23cefd
5
5
  SHA512:
6
- metadata.gz: 490013779c30677fd6f3618275bb0bd9f11f0da30c43b320e2fda3b814762d18fac68391c9b6e91c9b9f80a5f3cc4ee6a008d48e4897b6dd79c314af085e5d5b
7
- data.tar.gz: 6c8b027fc5db3dd04aa20b034a09e80a4041d527e688131a0bdfbc2f87a9390bef47adc14db5e096a195d69ddcc45ad17d68f43dbb91846ce09fba8919d6d5c0
6
+ metadata.gz: 56ac6e332c9e4cba97c1adb271a6e36dd1fb0c8eb1b9549950d7048c208e9ecf2f9bdb8529dd326d2b5bece9fa2d56f961e8199b2c5667167ee5965ffa3f7596
7
+ data.tar.gz: b84490766d8ade6a489ba72ee7711c25b6e16e75861073f9b3c4346c8d0da4ecdca2321c9f590860070260ab9ae144429e4837ae9fced893b0983feee3718dd0
data/README.md CHANGED
@@ -141,12 +141,13 @@ Contributions are welcome! Please see our [Contributing Guide](https://github.co
141
141
 
142
142
  ## Part of Kreuzberg.dev
143
143
 
144
- - [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) — document intelligence: text, tables, metadata from 90+ formats with optional OCR.
144
+ - [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) — document intelligence: text, tables, metadata from 91+ formats with optional OCR.
145
145
  - [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
146
+ - [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
146
147
  - [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
147
148
  - [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
148
149
  - [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
149
- - [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
150
+ - [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces every per-language binding across the 5 polyglot repos.
150
151
  - [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
151
152
 
152
153
  ## License
@@ -158,4 +159,3 @@ This project is licensed under [Elastic License 2.0](https://github.com/kreuzber
158
159
  - [Documentation](https://docs.kreuzcrawl.kreuzberg.dev)
159
160
  - [GitHub Repository](https://github.com/kreuzberg-dev/kreuzcrawl)
160
161
  - [Issue Tracker](https://github.com/kreuzberg-dev/kreuzcrawl/issues)
161
- - [Issues](https://github.com/kreuzberg-dev/kreuzcrawl/issues)
@@ -1460,9 +1460,9 @@ dependencies = [
1460
1460
 
1461
1461
  [[package]]
1462
1462
  name = "kreuzcrawl"
1463
- version = "0.3.0-rc.71"
1463
+ version = "0.3.0-rc.72"
1464
1464
  source = "registry+https://github.com/rust-lang/crates.io-index"
1465
- checksum = "e8c85e79454c048ec41289dfb0168e87ff1c5a0a28997de8cc0dbd3ab331c8e2"
1465
+ checksum = "8a83f0115e5fce77f5f6a4c53d572b8c151038c0fcd16c641db2e8b499603582"
1466
1466
  dependencies = [
1467
1467
  "ahash",
1468
1468
  "aho-corasick",
@@ -1502,7 +1502,7 @@ dependencies = [
1502
1502
 
1503
1503
  [[package]]
1504
1504
  name = "kreuzcrawl-rb"
1505
- version = "0.3.0-rc.71"
1505
+ version = "0.3.0-rc.72"
1506
1506
  dependencies = [
1507
1507
  "futures",
1508
1508
  "kreuzcrawl",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzcrawl-rb"
3
- version = "0.3.0-rc.71"
3
+ version = "0.3.0-rc.72"
4
4
  edition = "2024"
5
5
  license = "Elastic-2.0"
6
6
  description = "High-performance web crawling engine"
@@ -18,7 +18,7 @@ crate-type = ["cdylib"]
18
18
 
19
19
  [dependencies]
20
20
  futures = "0.3"
21
- kreuzcrawl = { version = "0.3.0-rc.71", features = ["interact", "browser-chromiumoxide"] }
21
+ kreuzcrawl = { version = "0.3.0-rc.72", features = ["interact", "browser-chromiumoxide"] }
22
22
  magnus = "0.8"
23
23
  rb-sys = ">=0.9, <0.9.128"
24
24
  serde = { version = "1", features = ["derive"] }
@@ -1,5 +1,5 @@
1
1
  // This file is auto-generated by alef. DO NOT EDIT.
2
- // alef:hash:f2acf0fe68bedde7da1909fa6feedaffb995d5aec49220ec23bf64125a392d19
2
+ // alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
3
3
  // Re-generate with: alef generate
4
4
  #![allow(dead_code, unused_imports, unused_variables)]
5
5
  #![allow(
@@ -5446,8 +5446,6 @@ impl From<CrawlConfig> for kreuzcrawl::CrawlConfig {
5446
5446
  warc_output: val.warc_output.map(Into::into),
5447
5447
  browser_profile: val.browser_profile,
5448
5448
  save_browser_profile: val.save_browser_profile,
5449
- ssrf: Default::default(),
5450
- dispatch: Default::default(),
5451
5449
  ..Default::default()
5452
5450
  }
5453
5451
  }
@@ -5525,17 +5523,18 @@ impl From<kreuzcrawl::BrowserExtras> for BrowserExtras {
5525
5523
  }
5526
5524
  }
5527
5525
 
5526
+ #[allow(clippy::needless_update)]
5528
5527
  #[allow(clippy::redundant_closure, clippy::useless_conversion)]
5529
5528
  impl From<DownloadedDocument> for kreuzcrawl::DownloadedDocument {
5530
5529
  fn from(val: DownloadedDocument) -> Self {
5531
5530
  Self {
5532
5531
  url: val.url,
5533
5532
  mime_type: val.mime_type.into(),
5534
- content: Default::default(),
5535
5533
  size: val.size,
5536
5534
  filename: val.filename.map(Into::into),
5537
5535
  content_hash: val.content_hash.into(),
5538
5536
  headers: val.headers.into_iter().map(|(k, v)| (k.into(), v.into())).collect(),
5537
+ ..Default::default()
5539
5538
  }
5540
5539
  }
5541
5540
  }
@@ -5554,6 +5553,7 @@ impl From<kreuzcrawl::DownloadedDocument> for DownloadedDocument {
5554
5553
  }
5555
5554
  }
5556
5555
 
5556
+ #[allow(clippy::needless_update)]
5557
5557
  #[allow(clippy::redundant_closure, clippy::useless_conversion)]
5558
5558
  impl From<InteractionResult> for kreuzcrawl::InteractionResult {
5559
5559
  fn from(val: InteractionResult) -> Self {
@@ -5561,7 +5561,7 @@ impl From<InteractionResult> for kreuzcrawl::InteractionResult {
5561
5561
  action_results: val.action_results.into_iter().map(Into::into).collect(),
5562
5562
  final_html: val.final_html,
5563
5563
  final_url: val.final_url,
5564
- screenshot: Default::default(),
5564
+ ..Default::default()
5565
5565
  }
5566
5566
  }
5567
5567
  }
@@ -5603,6 +5603,7 @@ impl From<kreuzcrawl::ActionResult> for ActionResult {
5603
5603
  }
5604
5604
  }
5605
5605
 
5606
+ #[allow(clippy::needless_update)]
5606
5607
  #[allow(clippy::redundant_closure, clippy::useless_conversion)]
5607
5608
  impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
5608
5609
  fn from(val: ScrapeResult) -> Self {
@@ -5633,9 +5634,9 @@ impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
5633
5634
  markdown: val.markdown.map(Into::into),
5634
5635
  extracted_data: val.extracted_data.as_ref().and_then(|s| serde_json::from_str(s).ok()),
5635
5636
  extraction_meta: val.extraction_meta.map(Into::into),
5636
- screenshot: Default::default(),
5637
5637
  downloaded_document: val.downloaded_document.map(Into::into),
5638
5638
  browser: val.browser.map(Into::into),
5639
+ ..Default::default()
5639
5640
  }
5640
5641
  }
5641
5642
  }
@@ -5734,6 +5735,7 @@ impl From<kreuzcrawl::CrawlPageResult> for CrawlPageResult {
5734
5735
  }
5735
5736
  }
5736
5737
 
5738
+ #[allow(clippy::needless_update)]
5737
5739
  #[allow(clippy::redundant_closure, clippy::useless_conversion)]
5738
5740
  impl From<CrawlResult> for kreuzcrawl::CrawlResult {
5739
5741
  fn from(val: CrawlResult) -> Self {
@@ -5746,7 +5748,7 @@ impl From<CrawlResult> for kreuzcrawl::CrawlResult {
5746
5748
  cookies: val.cookies.into_iter().map(Into::into).collect(),
5747
5749
  stayed_on_domain: val.stayed_on_domain,
5748
5750
  browser_used: val.browser_used,
5749
- normalized_urls: Default::default(),
5751
+ ..Default::default()
5750
5752
  }
5751
5753
  }
5752
5754
  }
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:f2acf0fe68bedde7da1909fa6feedaffb995d5aec49220ec23bf64125a392d19
2
+ # alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # frozen_string_literal: true
@@ -106,10 +106,7 @@ module Kreuzcrawl
106
106
  # Not available on `wasm32` targets — streaming requires native concurrency
107
107
  # primitives (tokio channels, `JoinSet`) that are not supported on wasm32.
108
108
  #
109
- # Delivered to bindings via alef's streaming-adapter pattern. The
110
- # `crawl_stream` / `batch_crawl_stream` binding wrappers in `bindings.rs`
111
- # expose this as the per-language streaming idiom (Python `AsyncIterator`,
112
- # Ruby `Enumerator`, PHP `Generator`, Elixir `Stream.unfold`, etc.).
109
+ # Delivered to bindings through each target's native streaming idiom.
113
110
  module CrawlEvent
114
111
  extend T::Helpers
115
112
  extend T::Sig
@@ -1,10 +1,10 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:f2acf0fe68bedde7da1909fa6feedaffb995d5aec49220ec23bf64125a392d19
2
+ # alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # frozen_string_literal: true
6
6
 
7
7
  module Kreuzcrawl
8
8
  ## The version string for this package.
9
- VERSION = "0.3.0.pre.rc.71"
9
+ VERSION = "0.3.0.pre.rc.72"
10
10
  end
data/lib/kreuzcrawl.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:f2acf0fe68bedde7da1909fa6feedaffb995d5aec49220ec23bf64125a392d19
2
+ # alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # frozen_string_literal: true
data/lib/kreuzcrawl_rb.so CHANGED
Binary file
data/sig/types.rbs CHANGED
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:f2acf0fe68bedde7da1909fa6feedaffb995d5aec49220ec23bf64125a392d19
2
+ # alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0.pre.rc.71
4
+ version: 0.3.0.pre.rc.72
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kreuzberg Team