multiwoven-integrations 0.30.1 → 0.30.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9fd3ed70334fa8ca4cf73fa9c6e47f253e372d9b9358ee3a1004de5880e4656b
|
4
|
+
data.tar.gz: 25a4d2b8442bd87197f84cc7c36fe3bbb7864cc0f329312fba37d9742cbc8bee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '08384a20d53b8b87472c509aedfaf37d68b3445a722686ba49b7fd86a4ef044eeeae7a60fa6cc1191f475b0c221ee6937a01ed635b5df784c1f5c494a9828cf5'
|
7
|
+
data.tar.gz: '085bb3fdb255ae339486d00e4747c9e97d5784e5b344dac3e5870c99229dfa71693e8d11831d9661cade3a879a42998e7c650638585be9c152e1b5a4d326702a'
|
@@ -18,12 +18,9 @@ module Multiwoven::Integrations::Source
|
|
18
18
|
failure_status(e)
|
19
19
|
end
|
20
20
|
|
21
|
-
def discover(connection_config)
|
22
|
-
|
23
|
-
|
24
|
-
response = execute_scrape(FIRECRAWL_SCRAPE_URL)
|
25
|
-
results = JSON.parse(response.body)
|
26
|
-
catalog = Catalog.new(streams: create_streams(results))
|
21
|
+
def discover(_connection_config = nil)
|
22
|
+
catalog_json = read_json(CATALOG_SPEC_PATH)
|
23
|
+
catalog = build_catalog(catalog_json)
|
27
24
|
catalog.to_multiwoven_message
|
28
25
|
rescue StandardError => e
|
29
26
|
handle_exception(e, { context: "FIRECRAWL:DISCOVER:EXCEPTION", type: "error" })
|
@@ -32,9 +29,8 @@ module Multiwoven::Integrations::Source
|
|
32
29
|
def read(sync_config)
|
33
30
|
connection_config = sync_config.source.connection_specification
|
34
31
|
connection_config = connection_config.with_indifferent_access
|
35
|
-
query = sync_config.model.query
|
36
32
|
url = create_connection(connection_config)
|
37
|
-
query(url, query)
|
33
|
+
query(url, nil, nil)
|
38
34
|
rescue StandardError => e
|
39
35
|
handle_exception(e, {
|
40
36
|
context: "FIRECRAWL:READ:EXCEPTION",
|
@@ -62,7 +58,15 @@ module Multiwoven::Integrations::Source
|
|
62
58
|
FIRECRAWL_CRAWL_URL
|
63
59
|
end
|
64
60
|
|
65
|
-
def query(url, _query)
|
61
|
+
def query(url, _query, limit = 1)
|
62
|
+
if limit.present?
|
63
|
+
if @config["includePaths"]&.any?
|
64
|
+
path = @config["includePaths"].first
|
65
|
+
@config["url"] = URI.join(@config["url"], path).to_s
|
66
|
+
end
|
67
|
+
@config.delete("includePaths")
|
68
|
+
@config[:limit] = limit
|
69
|
+
end
|
66
70
|
request = execute_crawl(url)
|
67
71
|
request = JSON.parse(request.body)
|
68
72
|
crawl_url = get_request_url(request)
|
@@ -89,16 +93,6 @@ module Multiwoven::Integrations::Source
|
|
89
93
|
)
|
90
94
|
end
|
91
95
|
|
92
|
-
def execute_scrape(url)
|
93
|
-
send_request(
|
94
|
-
url: url,
|
95
|
-
http_method: HTTP_POST,
|
96
|
-
payload: JSON.parse({ "url": @base_url }.to_json),
|
97
|
-
headers: auth_headers(@api_key),
|
98
|
-
config: {}
|
99
|
-
)
|
100
|
-
end
|
101
|
-
|
102
96
|
def crawl_activity
|
103
97
|
send_request(
|
104
98
|
url: FIRECRAWL_CRAWL_ACTIVE_URL,
|
@@ -148,37 +142,6 @@ module Multiwoven::Integrations::Source
|
|
148
142
|
end
|
149
143
|
end
|
150
144
|
|
151
|
-
def create_streams(records)
|
152
|
-
group_by_table(records).map do |r|
|
153
|
-
Multiwoven::Integrations::Protocol::Stream.new(name: r[:tablename], action: StreamAction["fetch"], json_schema: convert_to_json_schema(r[:columns]))
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
|
-
def group_by_table(response)
|
158
|
-
columns = response["data"].map do |key, value|
|
159
|
-
{
|
160
|
-
column_name: key,
|
161
|
-
data_type: "string",
|
162
|
-
is_nullable: value.nil?
|
163
|
-
}
|
164
|
-
end
|
165
|
-
|
166
|
-
if response["data"]["metadata"]["url"]
|
167
|
-
columns << {
|
168
|
-
column_name: "url",
|
169
|
-
data_type: "string",
|
170
|
-
is_nullable: response["data"]["metadata"]["url"].nil?
|
171
|
-
}
|
172
|
-
end
|
173
|
-
|
174
|
-
[
|
175
|
-
{
|
176
|
-
tablename: "scrape",
|
177
|
-
columns: columns
|
178
|
-
}
|
179
|
-
]
|
180
|
-
end
|
181
|
-
|
182
145
|
def build_url(url, id)
|
183
146
|
format(url, id: id)
|
184
147
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
{
|
2
|
+
"request_rate_limit": 15,
|
3
|
+
"request_rate_limit_unit": "minute",
|
4
|
+
"request_rate_concurrency": 5,
|
5
|
+
"streams": [
|
6
|
+
{
|
7
|
+
"name": "scrape",
|
8
|
+
"action": "fetch",
|
9
|
+
"json_schema": {
|
10
|
+
"type": "object",
|
11
|
+
"properties": {
|
12
|
+
"markdown": {
|
13
|
+
"type": "string"
|
14
|
+
},
|
15
|
+
"metadata": {
|
16
|
+
"type": "string"
|
17
|
+
},
|
18
|
+
"url": {
|
19
|
+
"type": "string"
|
20
|
+
}
|
21
|
+
}
|
22
|
+
},
|
23
|
+
"supported_sync_modes": ["incremental"]
|
24
|
+
}
|
25
|
+
]
|
26
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: multiwoven-integrations
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.30.1
|
4
|
+
version: 0.30.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Subin T P
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-06-
|
11
|
+
date: 2025-06-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -752,6 +752,7 @@ files:
|
|
752
752
|
- lib/multiwoven/integrations/source/databrics_model/config/spec.json
|
753
753
|
- lib/multiwoven/integrations/source/databrics_model/icon.svg
|
754
754
|
- lib/multiwoven/integrations/source/firecrawl/client.rb
|
755
|
+
- lib/multiwoven/integrations/source/firecrawl/config/catalog.json
|
755
756
|
- lib/multiwoven/integrations/source/firecrawl/config/meta.json
|
756
757
|
- lib/multiwoven/integrations/source/firecrawl/config/spec.json
|
757
758
|
- lib/multiwoven/integrations/source/firecrawl/icon.svg
|