multiwoven-integrations 0.30.1 → 0.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 77b60f06178a034c3b84ac94d582f500a25f2e987c6a3b813b8220e52bf2958c
4
- data.tar.gz: abd9af3678499e495b4f6b6650db793aa9a29a1a7781ff84caad07d1826e1b88
3
+ metadata.gz: 51616b2e7b06336873aceb0c9e5731526dd7e57ba332eab190bc215d99c4ed8d
4
+ data.tar.gz: d376ae2826566eea31fd92dbd4e4ddf16034e1ef9085edf08f7b554744424583
5
5
  SHA512:
6
- metadata.gz: f4535cf7f29ee0dec79736d292018fa18c5d0126e7603401f07dec97a10a8b63efb1f616afcf0b230cd2858f8a7979b2e191d6b169d6c2fb41ea06c9bf64eded
7
- data.tar.gz: b4a3288ae74b92dfcf729d970db0a2cadd670aee1a5ee83f536bab6c42c07394a587b18a2e48fcf913257d0e6d6e97d6283aecf342c3b43b14ae0e53ac93dd95
6
+ metadata.gz: fc9778e2da0499ed9bec602005b3b69aadd8e038e251d77acc8407bde99dbf213c1ff0d6f937a832c1d4aa660dbae578de8042caf9c0720fca3ddcff2e0d762a
7
+ data.tar.gz: a8438815522f82a1bc6db95c6bd29c94a55f182bf459f90c3e01209f54b17e303d939280a63549f6c1eff5da4e57ca1f80e85b98646f6f935995f3a371f7b274
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Multiwoven
4
4
  module Integrations
5
- VERSION = "0.30.1"
5
+ VERSION = "0.30.2"
6
6
 
7
7
  ENABLED_SOURCES = %w[
8
8
  Snowflake
@@ -18,12 +18,9 @@ module Multiwoven::Integrations::Source
18
18
  failure_status(e)
19
19
  end
20
20
 
21
- def discover(connection_config)
22
- connection_config = connection_config.with_indifferent_access
23
- create_connection(connection_config)
24
- response = execute_scrape(FIRECRAWL_SCRAPE_URL)
25
- results = JSON.parse(response.body)
26
- catalog = Catalog.new(streams: create_streams(results))
21
+ def discover(_connection_config = nil)
22
+ catalog_json = read_json(CATALOG_SPEC_PATH)
23
+ catalog = build_catalog(catalog_json)
27
24
  catalog.to_multiwoven_message
28
25
  rescue StandardError => e
29
26
  handle_exception(e, { context: "FIRECRAWL:DISCOVER:EXCEPTION", type: "error" })
@@ -32,9 +29,8 @@ module Multiwoven::Integrations::Source
32
29
  def read(sync_config)
33
30
  connection_config = sync_config.source.connection_specification
34
31
  connection_config = connection_config.with_indifferent_access
35
- query = sync_config.model.query
36
32
  url = create_connection(connection_config)
37
- query(url, query)
33
+ query(url, nil)
38
34
  rescue StandardError => e
39
35
  handle_exception(e, {
40
36
  context: "FIRECRAWL:READ:EXCEPTION",
@@ -62,7 +58,16 @@ module Multiwoven::Integrations::Source
62
58
  FIRECRAWL_CRAWL_URL
63
59
  end
64
60
 
65
- def query(url, _query)
61
+ def query(url, query)
62
+ has_limit = query.match(/LIMIT\s+(\d+)\s*$/i) if query.present?
63
+ if has_limit.present?
64
+ if @config["includePaths"]&.any?
65
+ path = @config["includePaths"].first
66
+ @config["url"] = URI.join(@config["url"], path).to_s
67
+ end
68
+ @config.delete("includePaths")
69
+ @config[:limit] = has_limit[1].to_i
70
+ end
66
71
  request = execute_crawl(url)
67
72
  request = JSON.parse(request.body)
68
73
  crawl_url = get_request_url(request)
@@ -89,16 +94,6 @@ module Multiwoven::Integrations::Source
89
94
  )
90
95
  end
91
96
 
92
- def execute_scrape(url)
93
- send_request(
94
- url: url,
95
- http_method: HTTP_POST,
96
- payload: JSON.parse({ "url": @base_url }.to_json),
97
- headers: auth_headers(@api_key),
98
- config: {}
99
- )
100
- end
101
-
102
97
  def crawl_activity
103
98
  send_request(
104
99
  url: FIRECRAWL_CRAWL_ACTIVE_URL,
@@ -148,37 +143,6 @@ module Multiwoven::Integrations::Source
148
143
  end
149
144
  end
150
145
 
151
- def create_streams(records)
152
- group_by_table(records).map do |r|
153
- Multiwoven::Integrations::Protocol::Stream.new(name: r[:tablename], action: StreamAction["fetch"], json_schema: convert_to_json_schema(r[:columns]))
154
- end
155
- end
156
-
157
- def group_by_table(response)
158
- columns = response["data"].map do |key, value|
159
- {
160
- column_name: key,
161
- data_type: "string",
162
- is_nullable: value.nil?
163
- }
164
- end
165
-
166
- if response["data"]["metadata"]["url"]
167
- columns << {
168
- column_name: "url",
169
- data_type: "string",
170
- is_nullable: response["data"]["metadata"]["url"].nil?
171
- }
172
- end
173
-
174
- [
175
- {
176
- tablename: "scrape",
177
- columns: columns
178
- }
179
- ]
180
- end
181
-
182
146
  def build_url(url, id)
183
147
  format(url, id: id)
184
148
  end
@@ -0,0 +1,26 @@
1
+ {
2
+ "request_rate_limit": 15,
3
+ "request_rate_limit_unit": "minute",
4
+ "request_rate_concurrency": 5,
5
+ "streams": [
6
+ {
7
+ "name": "scrape",
8
+ "action": "fetch",
9
+ "json_schema": {
10
+ "type": "object",
11
+ "properties": {
12
+ "markdown": {
13
+ "type": "string"
14
+ },
15
+ "metadata": {
16
+ "type": "string"
17
+ },
18
+ "url": {
19
+ "type": "string"
20
+ }
21
+ }
22
+ },
23
+ "supported_sync_modes": ["incremental"]
24
+ }
25
+ ]
26
+ }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: multiwoven-integrations
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.30.1
4
+ version: 0.30.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Subin T P
@@ -752,6 +752,7 @@ files:
752
752
  - lib/multiwoven/integrations/source/databrics_model/config/spec.json
753
753
  - lib/multiwoven/integrations/source/databrics_model/icon.svg
754
754
  - lib/multiwoven/integrations/source/firecrawl/client.rb
755
+ - lib/multiwoven/integrations/source/firecrawl/config/catalog.json
755
756
  - lib/multiwoven/integrations/source/firecrawl/config/meta.json
756
757
  - lib/multiwoven/integrations/source/firecrawl/config/spec.json
757
758
  - lib/multiwoven/integrations/source/firecrawl/icon.svg