kreuzcrawl 0.3.0.pre.rc.71 → 0.3.0.pre.rc.72
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/ext/kreuzcrawl_rb/native/Cargo.lock +3 -3
- data/ext/kreuzcrawl_rb/native/Cargo.toml +2 -2
- data/ext/kreuzcrawl_rb/src/lib.rs +9 -7
- data/lib/kreuzcrawl/native.rb +2 -5
- data/lib/kreuzcrawl/version.rb +2 -2
- data/lib/kreuzcrawl.rb +1 -1
- data/lib/kreuzcrawl_rb.so +0 -0
- data/sig/types.rbs +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8edea9a39a5cbe88bec892ae7fc8de37f0a4b4d5dc3430f5a6e4c0ccb475e443
|
|
4
|
+
data.tar.gz: 3dd64e9b4d361cc4d3ec82178260aaa018b0ca1d1d2f76034cb20ed05c23cefd
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 56ac6e332c9e4cba97c1adb271a6e36dd1fb0c8eb1b9549950d7048c208e9ecf2f9bdb8529dd326d2b5bece9fa2d56f961e8199b2c5667167ee5965ffa3f7596
|
|
7
|
+
data.tar.gz: b84490766d8ade6a489ba72ee7711c25b6e16e75861073f9b3c4346c8d0da4ecdca2321c9f590860070260ab9ae144429e4837ae9fced893b0983feee3718dd0
|
data/README.md
CHANGED
|
@@ -141,12 +141,13 @@ Contributions are welcome! Please see our [Contributing Guide](https://github.co
|
|
|
141
141
|
|
|
142
142
|
## Part of Kreuzberg.dev
|
|
143
143
|
|
|
144
|
-
- [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) — document intelligence: text, tables, metadata from
|
|
144
|
+
- [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) — document intelligence: text, tables, metadata from 91+ formats with optional OCR.
|
|
145
145
|
- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
|
|
146
|
+
- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
|
|
146
147
|
- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
|
|
147
148
|
- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
|
|
148
149
|
- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
|
|
149
|
-
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces
|
|
150
|
+
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces every per-language binding across the 5 polyglot repos.
|
|
150
151
|
- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
|
|
151
152
|
|
|
152
153
|
## License
|
|
@@ -158,4 +159,3 @@ This project is licensed under [Elastic License 2.0](https://github.com/kreuzber
|
|
|
158
159
|
- [Documentation](https://docs.kreuzcrawl.kreuzberg.dev)
|
|
159
160
|
- [GitHub Repository](https://github.com/kreuzberg-dev/kreuzcrawl)
|
|
160
161
|
- [Issue Tracker](https://github.com/kreuzberg-dev/kreuzcrawl/issues)
|
|
161
|
-
- [Issues](https://github.com/kreuzberg-dev/kreuzcrawl/issues)
|
|
data/ext/kreuzcrawl_rb/native/Cargo.lock
CHANGED
|
@@ -1460,9 +1460,9 @@ dependencies = [
|
|
|
1460
1460
|
|
|
1461
1461
|
[[package]]
|
|
1462
1462
|
name = "kreuzcrawl"
|
|
1463
|
-
version = "0.3.0-rc.71"
|
|
1463
|
+
version = "0.3.0-rc.72"
|
|
1464
1464
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1465
|
-
checksum = "
|
|
1465
|
+
checksum = "8a83f0115e5fce77f5f6a4c53d572b8c151038c0fcd16c641db2e8b499603582"
|
|
1466
1466
|
dependencies = [
|
|
1467
1467
|
"ahash",
|
|
1468
1468
|
"aho-corasick",
|
|
@@ -1502,7 +1502,7 @@ dependencies = [
|
|
|
1502
1502
|
|
|
1503
1503
|
[[package]]
|
|
1504
1504
|
name = "kreuzcrawl-rb"
|
|
1505
|
-
version = "0.3.0-rc.71"
|
|
1505
|
+
version = "0.3.0-rc.72"
|
|
1506
1506
|
dependencies = [
|
|
1507
1507
|
"futures",
|
|
1508
1508
|
"kreuzcrawl",
|
|
data/ext/kreuzcrawl_rb/native/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzcrawl-rb"
|
|
3
|
-
version = "0.3.0-rc.71"
|
|
3
|
+
version = "0.3.0-rc.72"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
license = "Elastic-2.0"
|
|
6
6
|
description = "High-performance web crawling engine"
|
|
@@ -18,7 +18,7 @@ crate-type = ["cdylib"]
|
|
|
18
18
|
|
|
19
19
|
[dependencies]
|
|
20
20
|
futures = "0.3"
|
|
21
|
-
kreuzcrawl = { version = "0.3.0-rc.
|
|
21
|
+
kreuzcrawl = { version = "0.3.0-rc.72", features = ["interact", "browser-chromiumoxide"] }
|
|
22
22
|
magnus = "0.8"
|
|
23
23
|
rb-sys = ">=0.9, <0.9.128"
|
|
24
24
|
serde = { version = "1", features = ["derive"] }
|
|
data/ext/kreuzcrawl_rb/src/lib.rs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
// This file is auto-generated by alef. DO NOT EDIT.
|
|
2
|
-
// alef:hash:
|
|
2
|
+
// alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
|
|
3
3
|
// Re-generate with: alef generate
|
|
4
4
|
#![allow(dead_code, unused_imports, unused_variables)]
|
|
5
5
|
#![allow(
|
|
@@ -5446,8 +5446,6 @@ impl From<CrawlConfig> for kreuzcrawl::CrawlConfig {
|
|
|
5446
5446
|
warc_output: val.warc_output.map(Into::into),
|
|
5447
5447
|
browser_profile: val.browser_profile,
|
|
5448
5448
|
save_browser_profile: val.save_browser_profile,
|
|
5449
|
-
ssrf: Default::default(),
|
|
5450
|
-
dispatch: Default::default(),
|
|
5451
5449
|
..Default::default()
|
|
5452
5450
|
}
|
|
5453
5451
|
}
|
|
@@ -5525,17 +5523,18 @@ impl From<kreuzcrawl::BrowserExtras> for BrowserExtras {
|
|
|
5525
5523
|
}
|
|
5526
5524
|
}
|
|
5527
5525
|
|
|
5526
|
+
#[allow(clippy::needless_update)]
|
|
5528
5527
|
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
|
|
5529
5528
|
impl From<DownloadedDocument> for kreuzcrawl::DownloadedDocument {
|
|
5530
5529
|
fn from(val: DownloadedDocument) -> Self {
|
|
5531
5530
|
Self {
|
|
5532
5531
|
url: val.url,
|
|
5533
5532
|
mime_type: val.mime_type.into(),
|
|
5534
|
-
content: Default::default(),
|
|
5535
5533
|
size: val.size,
|
|
5536
5534
|
filename: val.filename.map(Into::into),
|
|
5537
5535
|
content_hash: val.content_hash.into(),
|
|
5538
5536
|
headers: val.headers.into_iter().map(|(k, v)| (k.into(), v.into())).collect(),
|
|
5537
|
+
..Default::default()
|
|
5539
5538
|
}
|
|
5540
5539
|
}
|
|
5541
5540
|
}
|
|
@@ -5554,6 +5553,7 @@ impl From<kreuzcrawl::DownloadedDocument> for DownloadedDocument {
|
|
|
5554
5553
|
}
|
|
5555
5554
|
}
|
|
5556
5555
|
|
|
5556
|
+
#[allow(clippy::needless_update)]
|
|
5557
5557
|
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
|
|
5558
5558
|
impl From<InteractionResult> for kreuzcrawl::InteractionResult {
|
|
5559
5559
|
fn from(val: InteractionResult) -> Self {
|
|
@@ -5561,7 +5561,7 @@ impl From<InteractionResult> for kreuzcrawl::InteractionResult {
|
|
|
5561
5561
|
action_results: val.action_results.into_iter().map(Into::into).collect(),
|
|
5562
5562
|
final_html: val.final_html,
|
|
5563
5563
|
final_url: val.final_url,
|
|
5564
|
-
|
|
5564
|
+
..Default::default()
|
|
5565
5565
|
}
|
|
5566
5566
|
}
|
|
5567
5567
|
}
|
|
@@ -5603,6 +5603,7 @@ impl From<kreuzcrawl::ActionResult> for ActionResult {
|
|
|
5603
5603
|
}
|
|
5604
5604
|
}
|
|
5605
5605
|
|
|
5606
|
+
#[allow(clippy::needless_update)]
|
|
5606
5607
|
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
|
|
5607
5608
|
impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
|
|
5608
5609
|
fn from(val: ScrapeResult) -> Self {
|
|
@@ -5633,9 +5634,9 @@ impl From<ScrapeResult> for kreuzcrawl::ScrapeResult {
|
|
|
5633
5634
|
markdown: val.markdown.map(Into::into),
|
|
5634
5635
|
extracted_data: val.extracted_data.as_ref().and_then(|s| serde_json::from_str(s).ok()),
|
|
5635
5636
|
extraction_meta: val.extraction_meta.map(Into::into),
|
|
5636
|
-
screenshot: Default::default(),
|
|
5637
5637
|
downloaded_document: val.downloaded_document.map(Into::into),
|
|
5638
5638
|
browser: val.browser.map(Into::into),
|
|
5639
|
+
..Default::default()
|
|
5639
5640
|
}
|
|
5640
5641
|
}
|
|
5641
5642
|
}
|
|
@@ -5734,6 +5735,7 @@ impl From<kreuzcrawl::CrawlPageResult> for CrawlPageResult {
|
|
|
5734
5735
|
}
|
|
5735
5736
|
}
|
|
5736
5737
|
|
|
5738
|
+
#[allow(clippy::needless_update)]
|
|
5737
5739
|
#[allow(clippy::redundant_closure, clippy::useless_conversion)]
|
|
5738
5740
|
impl From<CrawlResult> for kreuzcrawl::CrawlResult {
|
|
5739
5741
|
fn from(val: CrawlResult) -> Self {
|
|
@@ -5746,7 +5748,7 @@ impl From<CrawlResult> for kreuzcrawl::CrawlResult {
|
|
|
5746
5748
|
cookies: val.cookies.into_iter().map(Into::into).collect(),
|
|
5747
5749
|
stayed_on_domain: val.stayed_on_domain,
|
|
5748
5750
|
browser_used: val.browser_used,
|
|
5749
|
-
|
|
5751
|
+
..Default::default()
|
|
5750
5752
|
}
|
|
5751
5753
|
}
|
|
5752
5754
|
}
|
data/lib/kreuzcrawl/native.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
|
@@ -106,10 +106,7 @@ module Kreuzcrawl
|
|
|
106
106
|
# Not available on `wasm32` targets — streaming requires native concurrency
|
|
107
107
|
# primitives (tokio channels, `JoinSet`) that are not supported on wasm32.
|
|
108
108
|
#
|
|
109
|
-
# Delivered to bindings
|
|
110
|
-
# `crawl_stream` / `batch_crawl_stream` binding wrappers in `bindings.rs`
|
|
111
|
-
# expose this as the per-language streaming idiom (Python `AsyncIterator`,
|
|
112
|
-
# Ruby `Enumerator`, PHP `Generator`, Elixir `Stream.unfold`, etc.).
|
|
109
|
+
# Delivered to bindings through each target's native streaming idiom.
|
|
113
110
|
module CrawlEvent
|
|
114
111
|
extend T::Helpers
|
|
115
112
|
extend T::Sig
|
data/lib/kreuzcrawl/version.rb
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
|
6
6
|
|
|
7
7
|
module Kreuzcrawl
|
|
8
8
|
## The version string for this package.
|
|
9
|
-
VERSION = "0.3.0.pre.rc.71"
|
|
9
|
+
VERSION = "0.3.0.pre.rc.72"
|
|
10
10
|
end
|
data/lib/kreuzcrawl.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# frozen_string_literal: true
|
data/lib/kreuzcrawl_rb.so
CHANGED
|
Binary file
|
data/sig/types.rbs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:bcc3d5bd544a6e7593876ab5de51eb8c1f53d0d8d2c8c3d31c436d54ead6b3ec
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
|