multiwoven-integrations 0.30.1 → 0.30.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 51616b2e7b06336873aceb0c9e5731526dd7e57ba332eab190bc215d99c4ed8d
|
4
|
+
data.tar.gz: d376ae2826566eea31fd92dbd4e4ddf16034e1ef9085edf08f7b554744424583
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fc9778e2da0499ed9bec602005b3b69aadd8e038e251d77acc8407bde99dbf213c1ff0d6f937a832c1d4aa660dbae578de8042caf9c0720fca3ddcff2e0d762a
|
7
|
+
data.tar.gz: a8438815522f82a1bc6db95c6bd29c94a55f182bf459f90c3e01209f54b17e303d939280a63549f6c1eff5da4e57ca1f80e85b98646f6f935995f3a371f7b274
|
@@ -18,12 +18,9 @@ module Multiwoven::Integrations::Source
|
|
18
18
|
failure_status(e)
|
19
19
|
end
|
20
20
|
|
21
|
-
def discover(
|
22
|
-
|
23
|
-
|
24
|
-
response = execute_scrape(FIRECRAWL_SCRAPE_URL)
|
25
|
-
results = JSON.parse(response.body)
|
26
|
-
catalog = Catalog.new(streams: create_streams(results))
|
21
|
+
def discover(_connection_config = nil)
|
22
|
+
catalog_json = read_json(CATALOG_SPEC_PATH)
|
23
|
+
catalog = build_catalog(catalog_json)
|
27
24
|
catalog.to_multiwoven_message
|
28
25
|
rescue StandardError => e
|
29
26
|
handle_exception(e, { context: "FIRECRAWL:DISCOVER:EXCEPTION", type: "error" })
|
@@ -32,9 +29,8 @@ module Multiwoven::Integrations::Source
|
|
32
29
|
def read(sync_config)
|
33
30
|
connection_config = sync_config.source.connection_specification
|
34
31
|
connection_config = connection_config.with_indifferent_access
|
35
|
-
query = sync_config.model.query
|
36
32
|
url = create_connection(connection_config)
|
37
|
-
query(url, query)
|
33
|
+
query(url, nil)
|
38
34
|
rescue StandardError => e
|
39
35
|
handle_exception(e, {
|
40
36
|
context: "FIRECRAWL:READ:EXCEPTION",
|
@@ -62,7 +58,16 @@ module Multiwoven::Integrations::Source
|
|
62
58
|
FIRECRAWL_CRAWL_URL
|
63
59
|
end
|
64
60
|
|
65
|
-
def query(url,
|
61
|
+
def query(url, query)
|
62
|
+
has_limit = query.match(/LIMIT\s+(\d+)\s*$/i) if query.present?
|
63
|
+
if has_limit.present?
|
64
|
+
if @config["includePaths"]&.any?
|
65
|
+
path = @config["includePaths"].first
|
66
|
+
@config["url"] = URI.join(@config["url"], path).to_s
|
67
|
+
end
|
68
|
+
@config.delete("includePaths")
|
69
|
+
@config[:limit] = has_limit[1].to_i
|
70
|
+
end
|
66
71
|
request = execute_crawl(url)
|
67
72
|
request = JSON.parse(request.body)
|
68
73
|
crawl_url = get_request_url(request)
|
@@ -89,16 +94,6 @@ module Multiwoven::Integrations::Source
|
|
89
94
|
)
|
90
95
|
end
|
91
96
|
|
92
|
-
def execute_scrape(url)
|
93
|
-
send_request(
|
94
|
-
url: url,
|
95
|
-
http_method: HTTP_POST,
|
96
|
-
payload: JSON.parse({ "url": @base_url }.to_json),
|
97
|
-
headers: auth_headers(@api_key),
|
98
|
-
config: {}
|
99
|
-
)
|
100
|
-
end
|
101
|
-
|
102
97
|
def crawl_activity
|
103
98
|
send_request(
|
104
99
|
url: FIRECRAWL_CRAWL_ACTIVE_URL,
|
@@ -148,37 +143,6 @@ module Multiwoven::Integrations::Source
|
|
148
143
|
end
|
149
144
|
end
|
150
145
|
|
151
|
-
def create_streams(records)
|
152
|
-
group_by_table(records).map do |r|
|
153
|
-
Multiwoven::Integrations::Protocol::Stream.new(name: r[:tablename], action: StreamAction["fetch"], json_schema: convert_to_json_schema(r[:columns]))
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
|
-
def group_by_table(response)
|
158
|
-
columns = response["data"].map do |key, value|
|
159
|
-
{
|
160
|
-
column_name: key,
|
161
|
-
data_type: "string",
|
162
|
-
is_nullable: value.nil?
|
163
|
-
}
|
164
|
-
end
|
165
|
-
|
166
|
-
if response["data"]["metadata"]["url"]
|
167
|
-
columns << {
|
168
|
-
column_name: "url",
|
169
|
-
data_type: "string",
|
170
|
-
is_nullable: response["data"]["metadata"]["url"].nil?
|
171
|
-
}
|
172
|
-
end
|
173
|
-
|
174
|
-
[
|
175
|
-
{
|
176
|
-
tablename: "scrape",
|
177
|
-
columns: columns
|
178
|
-
}
|
179
|
-
]
|
180
|
-
end
|
181
|
-
|
182
146
|
def build_url(url, id)
|
183
147
|
format(url, id: id)
|
184
148
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
{
|
2
|
+
"request_rate_limit": 15,
|
3
|
+
"request_rate_limit_unit": "minute",
|
4
|
+
"request_rate_concurrency": 5,
|
5
|
+
"streams": [
|
6
|
+
{
|
7
|
+
"name": "scrape",
|
8
|
+
"action": "fetch",
|
9
|
+
"json_schema": {
|
10
|
+
"type": "object",
|
11
|
+
"properties": {
|
12
|
+
"markdown": {
|
13
|
+
"type": "string"
|
14
|
+
},
|
15
|
+
"metadata": {
|
16
|
+
"type": "string"
|
17
|
+
},
|
18
|
+
"url": {
|
19
|
+
"type": "string"
|
20
|
+
}
|
21
|
+
}
|
22
|
+
},
|
23
|
+
"supported_sync_modes": ["incremental"]
|
24
|
+
}
|
25
|
+
]
|
26
|
+
}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: multiwoven-integrations
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.30.1
|
4
|
+
version: 0.30.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Subin T P
|
@@ -752,6 +752,7 @@ files:
|
|
752
752
|
- lib/multiwoven/integrations/source/databrics_model/config/spec.json
|
753
753
|
- lib/multiwoven/integrations/source/databrics_model/icon.svg
|
754
754
|
- lib/multiwoven/integrations/source/firecrawl/client.rb
|
755
|
+
- lib/multiwoven/integrations/source/firecrawl/config/catalog.json
|
755
756
|
- lib/multiwoven/integrations/source/firecrawl/config/meta.json
|
756
757
|
- lib/multiwoven/integrations/source/firecrawl/config/spec.json
|
757
758
|
- lib/multiwoven/integrations/source/firecrawl/icon.svg
|