@intuned/browser-dev 0.1.15-dev.0 → 0.1.16-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc +3 -0
- package/dist/ai/isPageLoaded.js +6 -6
- package/dist/ai/tests/extractStructuredDataDomMatchingCache.spec.js +87 -0
- package/dist/ai/tests/testMatching.spec.js +38 -0
- package/dist/ai/tests/testValidateMatchesMapping.spec.js +58 -0
- package/dist/common/matching/matching.js +3 -3
- package/dist/common/xpathMapping.js +23 -10
- package/dist/helpers/downloadFile.js +3 -0
- package/dist/helpers/saveFileToS3.js +3 -0
- package/dist/helpers/tests/testDownloadFile.spec.js +16 -0
- package/dist/optimized-extractors/common/index.js +4 -4
- package/dist/optimized-extractors/common/matching/utils.js +4 -2
- package/dist/optimized-extractors/common/modelStringSupport.test.js +82 -0
- package/dist/optimized-extractors/export.d.ts +2 -50
- package/dist/optimized-extractors/extractArray.js +3 -1
- package/dist/optimized-extractors/index.d.ts +2 -50
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/dynamicListExtractor.spec.js +366 -1
- package/dist/optimized-extractors/listExtractionHelpers/__tests__/testArrayExtractorFromLocator.spec.js +43 -0
- package/dist/optimized-extractors/listExtractionHelpers/dynamicListExtractor.js +4 -3
- package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +31 -22
- package/dist/optimized-extractors/objectExtractionHelpers/AIExtractors.js +2 -1
- package/dist/optimized-extractors/objectExtractionHelpers/__tests__/testObjectExtractorFromLocator.spec.js +208 -0
- package/dist/optimized-extractors/objectExtractionHelpers/dynamicObjectExtractor.js +4 -2
- package/dist/optimized-extractors/validators.js +4 -5
- package/package.json +1 -1
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
var _extendedTest = require("../../../common/extendedTest");
|
|
4
4
|
var _ = require("../..");
|
|
5
|
+
var _neverthrow = require("neverthrow");
|
|
5
6
|
var _uuid = require("uuid");
|
|
7
|
+
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
6
8
|
const productTemplate = `
|
|
7
9
|
<div class="product-page">
|
|
8
10
|
<div class="product-info">
|
|
@@ -104,4 +106,210 @@ _extendedTest.describe.skip("Object Extractor Caching Tests", () => {
|
|
|
104
106
|
console.log("All cache behavior tests completed successfully!");
|
|
105
107
|
});
|
|
106
108
|
});
|
|
109
|
+
});
|
|
110
|
+
async function mockExtractStructuredDataUsingAi(result) {
|
|
111
|
+
const commonModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../common")));
|
|
112
|
+
return _extendedTest.vi.spyOn(commonModule, "extractStructuredDataUsingAi").mockResolvedValue((0, _neverthrow.ok)({
|
|
113
|
+
result
|
|
114
|
+
}));
|
|
115
|
+
}
|
|
116
|
+
async function useInMemoryCache() {
|
|
117
|
+
const cacheModule = await Promise.resolve().then(() => _interopRequireWildcard(require("../../../intunedServices/cache")));
|
|
118
|
+
const store = new Map();
|
|
119
|
+
_extendedTest.vi.spyOn(cacheModule.cache, "get").mockImplementation(async key => store.has(key) ? store.get(key) : null);
|
|
120
|
+
_extendedTest.vi.spyOn(cacheModule.cache, "set").mockImplementation(async (key, value) => {
|
|
121
|
+
store.set(key, value);
|
|
122
|
+
});
|
|
123
|
+
}
|
|
124
|
+
(0, _extendedTest.describe)("extractObjectFromLocator - cache round-trip (real matcher, mocked AI)", () => {
|
|
125
|
+
(0, _extendedTest.afterEach)(() => {
|
|
126
|
+
_extendedTest.vi.restoreAllMocks();
|
|
127
|
+
});
|
|
128
|
+
(0, _extendedTest.test)("reuses the cache on a second identical call when the match was partial (PH: prefix)", async ({
|
|
129
|
+
page
|
|
130
|
+
}) => {
|
|
131
|
+
const label = `contact-roundtrip-${(0, _uuid.v4)()}`;
|
|
132
|
+
const aiResult = {
|
|
133
|
+
contact_information_name: "Andreas Jansson",
|
|
134
|
+
contact_information_phone_number: "(727) 471-4768",
|
|
135
|
+
contact_information_email: "andreas.jansson@flowbird.group"
|
|
136
|
+
};
|
|
137
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(aiResult);
|
|
138
|
+
await useInMemoryCache();
|
|
139
|
+
await page.setContent(`
|
|
140
|
+
<div class="vendor">
|
|
141
|
+
<h2>Contact Information</h2>
|
|
142
|
+
<p class="c-name">Andreas Jansson</p>
|
|
143
|
+
<p class="c-phone">PH: (727) 471-4768</p>
|
|
144
|
+
<p class="c-email">andreas.jansson@flowbird.group</p>
|
|
145
|
+
</div>
|
|
146
|
+
`);
|
|
147
|
+
const options = {
|
|
148
|
+
entityName: "supplier_contacts",
|
|
149
|
+
label,
|
|
150
|
+
entitySchema: {
|
|
151
|
+
type: "object",
|
|
152
|
+
required: ["contact_information_email"],
|
|
153
|
+
properties: {
|
|
154
|
+
contact_information_name: {
|
|
155
|
+
type: "string",
|
|
156
|
+
description: "the contact person name"
|
|
157
|
+
},
|
|
158
|
+
contact_information_phone_number: {
|
|
159
|
+
type: "string",
|
|
160
|
+
description: "the contact phone number"
|
|
161
|
+
},
|
|
162
|
+
contact_information_email: {
|
|
163
|
+
type: "string",
|
|
164
|
+
description: "the contact email"
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
},
|
|
168
|
+
strategy: {
|
|
169
|
+
model: "claude-sonnet-4-20250514",
|
|
170
|
+
type: "HTML"
|
|
171
|
+
},
|
|
172
|
+
variantKey: label
|
|
173
|
+
};
|
|
174
|
+
const first = await (0, _.extractObjectFromLocator)(page.locator("div.vendor"), options);
|
|
175
|
+
(0, _extendedTest.expect)(first).toEqual(aiResult);
|
|
176
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
177
|
+
const second = await (0, _.extractObjectFromLocator)(page.locator("div.vendor"), options);
|
|
178
|
+
(0, _extendedTest.expect)(second).toEqual(first);
|
|
179
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
180
|
+
});
|
|
181
|
+
(0, _extendedTest.test)("reuses the cache when fields sit in separate direct text nodes of one element (br-separated block)", async ({
|
|
182
|
+
page
|
|
183
|
+
}) => {
|
|
184
|
+
const label = `vendor-block-${(0, _uuid.v4)()}`;
|
|
185
|
+
const vendorAiResult = {
|
|
186
|
+
primary_vendor_name: "V00000110 - International Cleaning Services, Inc.",
|
|
187
|
+
primary_vendor_contact_email: "monaco.intl@sbcglobal.net",
|
|
188
|
+
primary_vendor_contact_phone: "(630) 904-4118"
|
|
189
|
+
};
|
|
190
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(vendorAiResult);
|
|
191
|
+
await useInMemoryCache();
|
|
192
|
+
await page.setContent(`
|
|
193
|
+
<table><tbody><tr><td class="vendor">
|
|
194
|
+
<a href="#">V00000110 - International Cleaning Services, Inc.</a>
|
|
195
|
+
2415 Comstock Court<br/>
|
|
196
|
+
Naperville, IL 60564<br/>
|
|
197
|
+
US<br/>
|
|
198
|
+
Email: monaco.intl@sbcglobal.net<br/>
|
|
199
|
+
FAX: (630) 904-4118
|
|
200
|
+
</td></tr></tbody></table>
|
|
201
|
+
`);
|
|
202
|
+
const options = {
|
|
203
|
+
entityName: "supplier_contacts",
|
|
204
|
+
label,
|
|
205
|
+
entitySchema: {
|
|
206
|
+
type: "object",
|
|
207
|
+
required: ["primary_vendor_name"],
|
|
208
|
+
properties: {
|
|
209
|
+
primary_vendor_name: {
|
|
210
|
+
type: "string",
|
|
211
|
+
description: "vendor id - company name"
|
|
212
|
+
},
|
|
213
|
+
primary_vendor_contact_email: {
|
|
214
|
+
type: "string",
|
|
215
|
+
description: "vendor email"
|
|
216
|
+
},
|
|
217
|
+
primary_vendor_contact_phone: {
|
|
218
|
+
type: "string",
|
|
219
|
+
description: "vendor phone"
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
},
|
|
223
|
+
strategy: {
|
|
224
|
+
model: "claude-sonnet-4-20250514",
|
|
225
|
+
type: "HTML"
|
|
226
|
+
},
|
|
227
|
+
variantKey: label
|
|
228
|
+
};
|
|
229
|
+
const first = await (0, _.extractObjectFromLocator)(page.locator("td.vendor"), options);
|
|
230
|
+
(0, _extendedTest.expect)(first).toEqual(vendorAiResult);
|
|
231
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
232
|
+
const second = await (0, _.extractObjectFromLocator)(page.locator("td.vendor"), options);
|
|
233
|
+
(0, _extendedTest.expect)(second).toEqual(first);
|
|
234
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
235
|
+
});
|
|
236
|
+
(0, _extendedTest.test)("reuses the cache for a link whose href is stored resolved but read raw (relative href)", async ({
|
|
237
|
+
page
|
|
238
|
+
}) => {
|
|
239
|
+
const label = `href-${(0, _uuid.v4)()}`;
|
|
240
|
+
const hrefAiResult = {
|
|
241
|
+
attachment_href: "https://example.com/files/annual-report.pdf",
|
|
242
|
+
attachment_name: "Annual Report"
|
|
243
|
+
};
|
|
244
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(hrefAiResult);
|
|
245
|
+
await useInMemoryCache();
|
|
246
|
+
await page.setContent(`<!DOCTYPE html><html><head><base href="https://example.com/files/"></head><body>` + `<div class="doc"><a href="annual-report.pdf">Annual Report</a></div>` + `</body></html>`);
|
|
247
|
+
const options = {
|
|
248
|
+
entityName: "attachment",
|
|
249
|
+
label,
|
|
250
|
+
entitySchema: {
|
|
251
|
+
type: "object",
|
|
252
|
+
required: ["attachment_href"],
|
|
253
|
+
properties: {
|
|
254
|
+
attachment_href: {
|
|
255
|
+
type: "string",
|
|
256
|
+
description: "the file url (href)"
|
|
257
|
+
},
|
|
258
|
+
attachment_name: {
|
|
259
|
+
type: "string",
|
|
260
|
+
description: "the file name"
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
},
|
|
264
|
+
strategy: {
|
|
265
|
+
model: "claude-sonnet-4-20250514",
|
|
266
|
+
type: "HTML"
|
|
267
|
+
},
|
|
268
|
+
variantKey: label
|
|
269
|
+
};
|
|
270
|
+
const first = await (0, _.extractObjectFromLocator)(page.locator("div.doc"), options);
|
|
271
|
+
(0, _extendedTest.expect)(first).toEqual(hrefAiResult);
|
|
272
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
273
|
+
const second = await (0, _.extractObjectFromLocator)(page.locator("div.doc"), options);
|
|
274
|
+
(0, _extendedTest.expect)(second).toEqual(first);
|
|
275
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
276
|
+
});
|
|
277
|
+
(0, _extendedTest.test)("reuses the cache for a FUZZY match (AI value differs slightly from the DOM)", async ({
|
|
278
|
+
page
|
|
279
|
+
}) => {
|
|
280
|
+
const label = `fuzzy-${(0, _uuid.v4)()}`;
|
|
281
|
+
const fuzzyAiResult = {
|
|
282
|
+
company_name: "Internationl Cleaning Servces"
|
|
283
|
+
};
|
|
284
|
+
const aiSpy = await mockExtractStructuredDataUsingAi(fuzzyAiResult);
|
|
285
|
+
await useInMemoryCache();
|
|
286
|
+
await page.setContent(`<div class="vendor"><span class="name">International Cleaning Services</span></div>`);
|
|
287
|
+
const options = {
|
|
288
|
+
entityName: "vendor",
|
|
289
|
+
label,
|
|
290
|
+
entitySchema: {
|
|
291
|
+
type: "object",
|
|
292
|
+
required: ["company_name"],
|
|
293
|
+
properties: {
|
|
294
|
+
company_name: {
|
|
295
|
+
type: "string",
|
|
296
|
+
description: "the company name"
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
},
|
|
300
|
+
strategy: {
|
|
301
|
+
model: "claude-sonnet-4-20250514",
|
|
302
|
+
type: "HTML"
|
|
303
|
+
},
|
|
304
|
+
variantKey: label
|
|
305
|
+
};
|
|
306
|
+
const first = await (0, _.extractObjectFromLocator)(page.locator("div.vendor"), options);
|
|
307
|
+
(0, _extendedTest.expect)(first).toEqual({
|
|
308
|
+
company_name: "International Cleaning Services"
|
|
309
|
+
});
|
|
310
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
311
|
+
const second = await (0, _.extractObjectFromLocator)(page.locator("div.vendor"), options);
|
|
312
|
+
(0, _extendedTest.expect)(second).toEqual(first);
|
|
313
|
+
(0, _extendedTest.expect)(aiSpy).toHaveBeenCalledTimes(1);
|
|
314
|
+
});
|
|
107
315
|
});
|
|
@@ -88,11 +88,13 @@ async function dynamicObjectExtractor(page, identifier, options) {
|
|
|
88
88
|
Object.entries(xpathMappingFromAI).forEach(([_propertyName, {
|
|
89
89
|
matchXpath,
|
|
90
90
|
matchText,
|
|
91
|
-
matchType
|
|
91
|
+
matchType,
|
|
92
|
+
sourceText
|
|
92
93
|
}]) => {
|
|
93
94
|
xpathMapping[matchText] = [{
|
|
94
95
|
xpath: matchXpath,
|
|
95
|
-
matchType
|
|
96
|
+
matchType,
|
|
97
|
+
sourceText
|
|
96
98
|
}];
|
|
97
99
|
});
|
|
98
100
|
const resultsToCache = {
|
|
@@ -5,22 +5,21 @@ Object.defineProperty(exports, "__esModule", {
|
|
|
5
5
|
});
|
|
6
6
|
exports.strategySchema = exports.simpleObjectJsonSchema = exports.simpleArrayItemJsonSchema = exports.extractObjectOptimizedInputSchema = exports.extractArrayOptimizedInputSchema = void 0;
|
|
7
7
|
var _zod = require("zod");
|
|
8
|
-
var _aiModelsValidation = require("./types/aiModelsValidation");
|
|
9
8
|
const htmlStrategySchema = _zod.z.object({
|
|
10
|
-
model: _zod.z.
|
|
9
|
+
model: _zod.z.string({
|
|
11
10
|
required_error: "strategy model is required",
|
|
12
11
|
invalid_type_error: "strategy model is invalid"
|
|
13
|
-
}),
|
|
12
|
+
}).min(1, "strategy model is required"),
|
|
14
13
|
type: _zod.z.literal("HTML", {
|
|
15
14
|
required_error: "strategy type is required",
|
|
16
15
|
invalid_type_error: "strategy type is invalid"
|
|
17
16
|
})
|
|
18
17
|
});
|
|
19
18
|
const imageStrategySchema = _zod.z.object({
|
|
20
|
-
model: _zod.z.
|
|
19
|
+
model: _zod.z.string({
|
|
21
20
|
required_error: "strategy model is required",
|
|
22
21
|
invalid_type_error: "strategy model is invalid"
|
|
23
|
-
}),
|
|
22
|
+
}).min(1, "strategy model is required"),
|
|
24
23
|
type: _zod.z.literal("IMAGE", {
|
|
25
24
|
required_error: "strategy type is required",
|
|
26
25
|
invalid_type_error: "strategy type is invalid"
|