@awi-protocol/sdk 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +36 -8
- package/dist/index.d.ts +36 -8
- package/dist/index.js +365 -91
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +364 -97
- package/dist/index.mjs.map +1 -1
- package/package.json +29 -50
- package/src/compiler/grammar/axir-schema.gbnf +97 -0
package/dist/index.mjs
CHANGED
|
@@ -1,10 +1,3 @@
|
|
|
1
|
-
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
|
|
2
|
-
get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
|
|
3
|
-
}) : x)(function(x) {
|
|
4
|
-
if (typeof require !== "undefined") return require.apply(this, arguments);
|
|
5
|
-
throw Error('Dynamic require of "' + x + '" is not supported');
|
|
6
|
-
});
|
|
7
|
-
|
|
8
1
|
// src/client.ts
|
|
9
2
|
import fetch from "cross-fetch";
|
|
10
3
|
var AWIError = class extends Error {
|
|
@@ -209,7 +202,6 @@ var AWIClient = class {
|
|
|
209
202
|
throw lastError || new AWIError("MAX_RETRIES", "Max retries exceeded", 502);
|
|
210
203
|
}
|
|
211
204
|
};
|
|
212
|
-
var client_default = AWIClient;
|
|
213
205
|
|
|
214
206
|
// src/advisory-executor.ts
|
|
215
207
|
var AdvisoryExecutor = class {
|
|
@@ -440,27 +432,312 @@ var AdvisoryExecutor = class {
|
|
|
440
432
|
}
|
|
441
433
|
};
|
|
442
434
|
|
|
435
|
+
// src/compiler/axir-compiler.ts
|
|
436
|
+
import * as cheerio from "cheerio";
|
|
437
|
+
/**
 * Deterministic (model-free) AXIR compiler.
 *
 * Loads an HTML document with cheerio, strips non-content nodes, classifies
 * page regions with fixed heuristics (search, navigation, listing, form,
 * pagination, detail), picks the region that best matches the requested
 * intent and emits a workflow graph, selector candidates and field
 * descriptors for it.
 */
var AXIRCompiler = class {
  /** Cheerio root function for the loaded document. */
  $;
  /** Intent name supplied by the caller (e.g. "search", "login"). */
  intent;
  /** Caller-supplied parameters; only the keys are used by this class. */
  params;
  /** Domain label copied into the workflow output. */
  domain;

  /**
   * @param {string} html - Markup to analyse.
   * @param {{intent?: string, params?: object, domain?: string}} [options]
   */
  constructor(html, options = {}) {
    this.$ = cheerio.load(html);
    this.intent = options.intent;
    this.params = options.params || {};
    this.domain = options.domain || "unknown";
  }

  /**
   * Runs the full pipeline and returns the AXIR result object.
   * Mutates the loaded document (see simplifyDOM).
   * @returns {object} workflow, intents, selectors, fields, container and
   *   bookkeeping values (model_used, tokens_used, compilation_time_ms).
   */
  compile() {
    const start = Date.now();
    this.simplifyDOM();
    const regions = this.identifyRegions();
    const target = this.routeIntent(regions);
    return {
      workflow: this.buildWorkflow(target, regions),
      intents: this.mapIntents(),
      selectors: this.generateSelectors(target),
      fields: this.generateFields(target),
      container: target.container,
      model_used: "axir-deterministic-v1",
      tokens_used: 0,
      compilation_time_ms: Date.now() - start
    };
  }

  /**
   * Removes script/media nodes, childless blank div/span elements and
   * elements marked hidden from the loaded document.
   */
  simplifyDOM() {
    this.$("script, style, svg, noscript, iframe, canvas, video, audio").remove();
    for (const el of this.$("div, span").toArray()) {
      const $el = this.$(el);
      if ($el.children().length === 0 && $el.text().trim() === "") $el.remove();
    }
    this.$('[style*="display:none"], [style*="display: none"], [hidden], [aria-hidden="true"]').remove();
  }

  /**
   * Scans the document and returns every recognised region.
   * @returns {Array<{type: string, element: object, confidence: number}>}
   */
  identifyRegions() {
    const regions = [];
    for (const el of this.$('form, [role="search"], input[type="search"]').toArray()) {
      const $el = this.$(el);
      // A search input nested in a form / search landmark is already
      // represented by that container; only standalone inputs get a region.
      if ($el.is("input") && $el.parents('form, [role="search"]').length > 0) continue;
      const r = this.analyzeSearchRegion($el);
      if (r) regions.push(r);
    }
    for (const el of this.$('nav, [role="navigation"], header, .nav, .navbar, .menu').toArray()) {
      regions.push({ type: "navigation", element: this.$(el), confidence: 0.9 });
    }
    for (const el of this.$('ul, ol, [role="list"], .list, .results, .items, table, [role="grid"]').toArray()) {
      const $el = this.$(el);
      if ($el.find('li, tr, .item, [role="listitem"]').length > 1) {
        regions.push({ type: "listing", element: $el, confidence: 0.85 });
      }
    }
    // NOTE(review): any form with a text input or a button was already
    // recorded as "search" above, so only the remaining forms end up typed
    // "form" here.
    for (const el of this.$("form").toArray()) {
      const $el = this.$(el);
      if (!regions.some((r) => r.element.is($el))) regions.push({ type: "form", element: $el, confidence: 0.9 });
    }
    for (const el of this.$(".pagination, .pager, .pages").toArray()) {
      if (this.isPagination(this.$(el))) regions.push({ type: "pagination", element: this.$(el), confidence: 0.8 });
    }
    for (const el of this.$('article, [role="article"], .content, .main, main, .detail').toArray()) {
      regions.push({ type: "detail", element: this.$(el), confidence: 0.75 });
    }
    return regions;
  }

  /**
   * Classifies an element as a search region when it is, or contains, a text
   * input, or contains a submit control.
   * @param {object} $el - Cheerio selection to inspect.
   * @returns {{type: string, element: object, confidence: number}|null}
   */
  analyzeSearchRegion($el) {
    const inputSelector = 'input[type="text"], input[type="search"], input:not([type])';
    // `find` only looks at descendants, so test the element itself as well;
    // otherwise a bare <input type="search"> could never qualify.
    const hasInput = $el.is(inputSelector) || $el.find(inputSelector).length > 0;
    const hasButton = $el.find('button, input[type="submit"]').length > 0;
    if (hasInput || hasButton) return { type: "search", element: $el, confidence: hasInput && hasButton ? 0.95 : 0.7 };
    return null;
  }

  /**
   * @param {object} $el - Candidate pagination element.
   * @returns {boolean} True when its text has a digit and a next/previous marker.
   */
  isPagination($el) {
    const text = $el.text().toLowerCase();
    return /\d+/.test(text) && (/next|>|\u203a|\u2192|\u00bb/.test(text) || /prev|previous|<|\u2039|\u2190|\u00ab/.test(text));
  }

  /**
   * Picks the region that best matches the configured intent. Earlier entries
   * in an intent's preference list weigh more; the weight is multiplied by the
   * region's confidence.
   * @param {Array} regions - Output of identifyRegions().
   * @returns {{region: object, container: string}}
   */
  routeIntent(regions) {
    const intentMap = {
      search: ["search", "form"],
      search_jobs: ["search", "listing", "form"],
      extract_list: ["listing", "search", "detail"],
      extract_detail: ["detail", "listing"],
      fill_form: ["form", "search"],
      navigate: ["navigation", "listing"],
      login: ["form"],
      filter: ["search", "listing"],
      sort: ["listing", "search"],
      scrape: ["listing", "detail", "search"]
    };
    const targetTypes = this.lookup(intentMap, this.intentKey()) || ["search", "listing", "form"];
    let best = null;
    let bestScore = 0;
    for (const r of regions) {
      const match = targetTypes.indexOf(r.type);
      const score = match >= 0 ? (targetTypes.length - match) * r.confidence : 0;
      if (score > bestScore) {
        bestScore = score;
        best = r;
      }
    }
    if (!best) best = this.findLargestRegion(regions);
    return { region: best, container: this.generateContainerSelector(best.element) };
  }

  /**
   * Fallback target: the region with the most descendant elements, or a
   * synthetic "unknown" region wrapping <body> when there are none.
   * @param {Array} regions
   * @returns {object}
   */
  findLargestRegion(regions) {
    if (regions.length === 0) return { type: "unknown", element: this.$("body"), confidence: 0.5 };
    return regions.reduce((l, c) => c.element.find("*").length > l.element.find("*").length ? c : l);
  }

  /**
   * Builds the workflow graph: one node per region (id `<type>_<index>`),
   * edges inferred pairwise, navigation regions as entry points and
   * listing/detail regions as exit points.
   * @param {object} _target - Unused.
   * @param {Array} all - Every identified region.
   * @returns {object}
   */
  buildWorkflow(_target, all) {
    const nodes = {};
    const edges = [];
    const entry = [];
    const exit = [];
    all.forEach((r, i) => {
      const id = `${r.type}_${i}`;
      const raw = r.element[0];
      nodes[id] = {
        node_id: id,
        element_type: this.mapType(r.type),
        semantic_role: r.type,
        intent: this.intent,
        tag: raw?.tagName?.toLowerCase(),
        selector_candidates: this.buildCandidates(r.element),
        confidence: r.confidence
      };
      if (r.type === "navigation") entry.push(id);
      if (r.type === "listing" || r.type === "detail") exit.push(id);
    });
    all.forEach((f, fi) => all.forEach((t, ti) => {
      if (fi !== ti) {
        const e = this.inferEdge(f, t, fi, ti);
        if (e) edges.push(e);
      }
    }));
    // Without a navigation region, fall back to the first node as entry point.
    if (entry.length === 0 && Object.keys(nodes).length > 0) entry.push(Object.keys(nodes)[0]);
    return { nodes, edges, entry_points: entry, exit_points: exit, domain: this.domain, page_type: this.inferPageType(all) };
  }

  /**
   * @param {string} t - Region type.
   * @returns {string} Matching element type, or "unknown".
   */
  mapType(t) {
    const m = {
      search: "search",
      navigation: "navigation",
      listing: "list",
      form: "form",
      pagination: "pagination",
      detail: "container"
    };
    return this.lookup(m, t) || "unknown";
  }

  /**
   * Returns the edge implied by a (from, to) pair of region types, or null.
   * Node ids follow the `<type>_<index>` scheme used by buildWorkflow.
   */
  inferEdge(f, t, fi, ti) {
    if (f.type === "search" && t.type === "listing") return { from_node: `search_${fi}`, to_node: `listing_${ti}`, action: "submit_search", probability: 0.9 };
    if (f.type === "navigation" && t.type === "search") return { from_node: `navigation_${fi}`, to_node: `search_${ti}`, action: "navigate_to_search", probability: 0.7 };
    if (f.type === "listing" && t.type === "pagination") return { from_node: `listing_${fi}`, to_node: `pagination_${ti}`, action: "next_page", probability: 0.8 };
    if (f.type === "pagination" && t.type === "listing") return { from_node: `pagination_${fi}`, to_node: `listing_${ti}`, action: "load_results", probability: 0.95 };
    return null;
  }

  /**
   * @param {Array} regions
   * @returns {string} Page type derived from which region types are present.
   */
  inferPageType(regions) {
    const t = regions.map((r) => r.type);
    if (t.includes("search") && t.includes("listing")) return "search";
    if (t.includes("listing")) return "listing";
    if (t.includes("form")) return "form";
    if (t.includes("search")) return "search";
    if (t.includes("navigation")) return "landing";
    return "unknown";
  }

  /**
   * Collects selector candidates for the target region, its form controls,
   * its buttons and its short-text links. Later entries with the same key
   * replace earlier ones.
   * @param {{region: object}} target
   * @returns {Object<string, Array>}
   */
  generateSelectors(target) {
    const s = {};
    const $el = target.region.element;
    s.container = this.buildCandidates($el);
    for (const el of $el.find("input, textarea, select").toArray()) {
      const n = this.inferFieldName(this.$(el));
      if (n) s[n] = this.buildCandidates(this.$(el));
    }
    for (const el of $el.find('button, input[type="submit"], input[type="button"]').toArray()) {
      const $btn = this.$(el);
      const label = $btn.text().trim() || String($btn.val() || "button");
      s[`btn_${this.slugify(label)}`] = this.buildCandidates($btn);
    }
    for (const el of $el.find("a").toArray()) {
      const $a = this.$(el);
      const t = $a.text().trim();
      if (t && t.length < 50) s[`link_${this.slugify(t)}`] = this.buildCandidates($a);
    }
    return s;
  }

  /**
   * Builds the prioritised selector candidates for one element, from its id,
   * classes, name/type/placeholder, role, aria-label and short text.
   * @param {object} $el - Cheerio selection; only its first element is used.
   * @returns {Array<{type: string, value: string, priority: number, confidence: number}>}
   */
  buildCandidates($el) {
    const c = [];
    const el = $el[0];
    if (!el) return c;
    const id = $el.attr("id");
    // Ids that start with a digit are not offered as candidates.
    if (id && !/^\d/.test(id)) c.push({ type: "css", value: `#${this.escape(id)}`, priority: 1, confidence: 0.99 });
    const classes = ($el.attr("class") || "").split(/\s+/).filter((x) => x && !x.match(/^js-|^ng-|^vue-|^data-/));
    if (classes.length) c.push({ type: "css", value: `.${classes.map((x) => this.escape(x)).join(".")}`, priority: 2, confidence: 0.85 });
    const tag = el.tagName?.toLowerCase() || "";
    const name = $el.attr("name");
    const type = $el.attr("type");
    const placeholder = $el.attr("placeholder");
    if (name) c.push({ type: "css", value: `${tag}[name="${this.q(name)}"]`, priority: 3, confidence: 0.9 });
    if (type) c.push({ type: "css", value: `${tag}[type="${this.q(type)}"]`, priority: 4, confidence: 0.8 });
    if (placeholder) c.push({ type: "css", value: `${tag}[placeholder="${this.q(placeholder)}"]`, priority: 5, confidence: 0.75 });
    const role = $el.attr("role");
    if (role) c.push({ type: "semantic", value: `[role="${this.q(role)}"]`, priority: 6, confidence: 0.9 });
    const al = $el.attr("aria-label");
    if (al) c.push({ type: "semantic", value: `[aria-label="${this.q(al)}"]`, priority: 7, confidence: 0.85 });
    const text = $el.text().trim();
    if (text && text.length < 100) c.push({ type: "text", value: text, priority: 8, confidence: 0.7 });
    return c;
  }

  /**
   * Describes every nameable form control inside the target region.
   * @param {{region: object}} target
   * @returns {Array<{name: string, selector: string, transform: (string|undefined), required: boolean}>}
   */
  generateFields(target) {
    const f = [];
    for (const el of target.region.element.find("input, textarea, select").toArray()) {
      const $el = this.$(el);
      const name = this.inferFieldName($el);
      if (!name) continue;
      f.push({
        name,
        selector: this.bestSelector($el),
        transform: this.inferTransform($el),
        required: $el.attr("required") !== void 0
      });
    }
    return f;
  }

  /**
   * Derives a slug name for a form control, trying in order: its <label for>,
   * placeholder, aria-label and name attribute. A source whose slug is empty
   * (for example text with no ASCII word characters) is skipped so the next
   * source gets a chance.
   * @param {object} $el
   * @returns {string|null}
   */
  inferFieldName($el) {
    const id = $el.attr("id");
    if (id) {
      // The id is quoted-string escaped so unusual ids cannot break the selector.
      const $label = this.$(`label[for="${this.q(id)}"]`);
      if ($label.length) {
        const slug = this.slugify($label.text());
        if (slug) return slug;
      }
    }
    for (const attr of ["placeholder", "aria-label", "name"]) {
      const value = $el.attr(attr);
      if (value) {
        const slug = this.slugify(value);
        if (slug) return slug;
      }
    }
    return null;
  }

  /**
   * @param {object} $el
   * @returns {string|undefined} Value transform implied by the control type.
   */
  inferTransform($el) {
    const t = $el.attr("type");
    if (t === "number") return "number";
    if (t === "email") return "email";
    if (t === "date") return "date";
    if (t === "checkbox") return "boolean";
    if ($el.is("select")) return "select";
    return void 0;
  }

  /**
   * @param {object} $el
   * @returns {string} First selector candidate, else the tag name, else "*".
   */
  bestSelector($el) {
    const c = this.buildCandidates($el);
    if (c.length) return c[0].value;
    const raw = $el[0];
    return raw?.tagName?.toLowerCase() || "*";
  }

  /**
   * @param {object} $el
   * @returns {string} First selector candidate for the element, else "body".
   */
  generateContainerSelector($el) {
    const c = this.buildCandidates($el);
    return c.length ? c[0].value : "body";
  }

  /**
   * Returns the intent descriptor for the configured intent, or a generic
   * "interact" descriptor when the intent is not one of the known names.
   * @returns {Array<{intent: string, action: string, parameters: string[], context: string}>}
   */
  mapIntents() {
    const m = {
      search: { intent: "search", action: "fill_and_submit", parameters: ["query", "location", "filters"], context: "Enter search terms and submit form" },
      search_jobs: { intent: "search_jobs", action: "fill_and_submit", parameters: ["query", "location", "experience_level", "job_type"], context: "Search for job listings with optional filters" },
      extract_list: { intent: "extract_list", action: "extract_fields", parameters: ["items", "title", "url", "metadata"], context: "Extract structured data from list items" },
      extract_detail: { intent: "extract_detail", action: "extract_fields", parameters: ["title", "description", "metadata", "links"], context: "Extract structured data from detail page" },
      fill_form: { intent: "fill_form", action: "fill_and_submit", parameters: Object.keys(this.params), context: "Fill form fields with provided parameters" },
      navigate: { intent: "navigate", action: "click", parameters: ["target_url", "link_text"], context: "Click navigation link to target page" },
      login: { intent: "login", action: "fill_and_submit", parameters: ["username", "password"], context: "Enter credentials and submit login form" },
      scrape: { intent: "scrape", action: "extract_fields", parameters: ["all_visible_text", "links", "images", "structured_data"], context: "Extract all visible content from the page" }
    };
    const mapped = this.lookup(m, this.intentKey());
    if (mapped) return [mapped];
    return [{ intent: this.intent, action: "interact", parameters: Object.keys(this.params), context: `Perform ${this.intent} on the page` }];
  }

  /**
   * Lower-cased intent used as a table key; a missing intent becomes "".
   * @returns {string}
   */
  intentKey() {
    return String(this.intent ?? "").toLowerCase();
  }

  /**
   * Own-property lookup, so keys such as "constructor" or "toString" do not
   * resolve to members inherited from Object.prototype.
   * @param {object} table
   * @param {string} key
   * @returns {*} The own value, or undefined.
   */
  lookup(table, key) {
    return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : void 0;
  }

  /**
   * Lower-cases, drops characters other than word characters, whitespace and
   * hyphens, collapses separators to "_" and caps the result at 50 characters.
   * @param {string} t
   * @returns {string}
   */
  slugify(t) {
    return String(t ?? "").toLowerCase().replace(/[^\w\s-]/g, "").replace(/[\s_-]+/g, "_").replace(/^_|_$/g, "").substring(0, 50);
  }

  /**
   * Escapes a string for use as a CSS identifier (id or class name),
   * following the CSSOM `CSS.escape` serialisation rules. ":" and "." are
   * still emitted as "\:" and "\.".
   * @param {string} s
   * @returns {string}
   */
  escape(s) {
    const str = String(s);
    let out = "";
    for (let i = 0; i < str.length; i++) {
      const ch = str.charAt(i);
      const code = str.charCodeAt(i);
      const isDigit = code >= 0x30 && code <= 0x39;
      if (code === 0) {
        out += "\uFFFD";
      } else if (
        (code >= 0x01 && code <= 0x1f) ||
        code === 0x7f ||
        (i === 0 && isDigit) ||
        (i === 1 && isDigit && str.charCodeAt(0) === 0x2d)
      ) {
        // Control characters and leading digits need a hex escape.
        out += `\\${code.toString(16)} `;
      } else if (i === 0 && str.length === 1 && code === 0x2d) {
        // A lone "-" is not a valid identifier.
        out += `\\${ch}`;
      } else if (
        code >= 0x80 ||
        code === 0x2d ||
        code === 0x5f ||
        isDigit ||
        (code >= 0x41 && code <= 0x5a) ||
        (code >= 0x61 && code <= 0x7a)
      ) {
        out += ch;
      } else {
        out += `\\${ch}`;
      }
    }
    return out;
  }

  /**
   * Escapes a value for use inside a double-quoted CSS attribute string.
   * Backslashes are escaped as well as quotes so they are not read as the
   * start of a CSS escape sequence.
   * @param {string} s
   * @returns {string}
   */
  q(s) {
    return String(s).replace(/["\\]/g, "\\$&");
  }
};
|
|
713
|
+
|
|
443
714
|
// src/compiler/local-axir.ts
|
|
444
715
|
import * as fs from "fs";
|
|
445
716
|
import * as path from "path";
|
|
446
717
|
import * as os from "os";
|
|
447
718
|
import * as https from "https";
|
|
448
719
|
import * as http from "http";
|
|
449
|
-
|
|
450
|
-
// src/compiler/grammar/axir-schema.gbnf
|
|
451
|
-
var axir_schema_default = 'root ::= "{" ws axir-fields ws "}"\n\naxir-fields ::=\n "\\"workflow\\"" ":" workflow ws ","\n ws "\\"intents\\"" ":" intents ws ","\n ws "\\"selectors\\"" ":" selectors ws ","\n ws "\\"fields\\"" ":" fields\n ["," ws "\\"container\\"" ":" string]\n ["," ws "\\"model_used\\"" ":" string]\n ["," ws "\\"tokens_used\\"" ":" number]\n ["," ws "\\"compilation_time_ms\\"" ":" number]\n\nworkflow ::=\n "{" ws\n "\\"nodes\\"" ":" "{" ws node-list ws "}" ws ","\n ws "\\"edges\\"" ":" "[" ws edge-list ws "]" ws ","\n ws "\\"entry_points\\"" ":" "[" ws string-list ws "]" ws ","\n ws "\\"exit_points\\"" ":" "[" ws string-list ws "]" ws ","\n ws "\\"domain\\"" ":" string ws ","\n ws "\\"page_type\\"" ":" page-type\n ["," ws "\\"structure_hash\\"" ":" string]\n ws "}"\n\nnode-list ::= [node-pair ("," ws node-pair)*]\nnode-pair ::= string ":" "{" ws\n "\\"element_type\\"" ":" element-type ws ","\n ws "\\"semantic_role\\"" ":" string ws ","\n ws "\\"intent\\"" ":" string ws ","\n ws "\\"tag\\"" ":" string ws ","\n ws "\\"selector_candidates\\"" ":" "[" ws selector-list ws "]"\n ["," ws "\\"parent_id\\"" ":" string]\n ["," ws "\\"children_ids\\"" ":" "[" ws string-list ws "]"]\n ["," ws "\\"aria_label\\"" ":" string]\n ["," ws "\\"aria_role\\"" ":" string]\n ["," ws "\\"text_content\\"" ":" string]\n "," ws "\\"confidence\\"" ":" number\n ["," ws "\\"reasoning\\"" ":" string]\nws "}"\n\nelement-type ::=\n "\\"button\\"" | "\\"link\\"" | "\\"input\\"" | "\\"form\\"" |\n "\\"navigation\\"" | "\\"search\\"" | "\\"filter\\"" | "\\"sort\\"" |\n "\\"pagination\\"" | "\\"container\\"" | "\\"list\\"" | "\\"item\\"" |\n "\\"heading\\"" | "\\"text\\"" | "\\"image\\"" | "\\"unknown\\""\n\npage-type ::=\n "\\"landing\\"" | "\\"search\\"" | "\\"listing\\"" | "\\"detail\\"" |\n "\\"form\\"" | "\\"checkout\\"" | "\\"dashboard\\"" | "\\"unknown\\""\n\nedge-list ::= [edge ("," ws edge)*]\nedge ::= "{" ws\n "\\"from_node\\"" ":" string ws ","\n ws "\\"to_node\\"" ":" 
string ws ","\n ws "\\"action\\"" ":" string\n ["," ws "\\"condition\\"" ":" string]\n ["," ws "\\"probability\\"" ":" number]\nws "}"\n\nselector-list ::= [selector ("," ws selector)*]\nselector ::= "{" ws\n "\\"type\\"" ":" selector-type ws ","\n ws "\\"value\\"" ":" string ws ","\n ws "\\"priority\\"" ":" number\n ["," ws "\\"confidence\\"" ":" number]\nws "}"\n\nselector-type ::= "\\"css\\"" | "\\"semantic\\"" | "\\"text\\"" | "\\"attribute\\""\n\nintents ::= "[" ws [intent ("," ws intent)*] ws "]"\nintent ::= "{" ws\n "\\"intent\\"" ":" string ws ","\n ws "\\"action\\"" ":" string ws ","\n ws "\\"parameters\\"" ":" "[" ws [string ("," ws string)*] ws "]" ws ","\n ws "\\"context\\"" ":" string\nws "}"\n\nfields ::= "[" ws [field ("," ws field)*] ws "]"\nfield ::= "{" ws\n "\\"name\\"" ":" string ws ","\n ws "\\"selector\\"" ":" string\n ["," ws "\\"transform\\"" ":" string]\n ["," ws "\\"required\\"" ":" boolean]\nws "}"\n\nselectors ::= "{" ws [selector-pair ("," ws selector-pair)*] ws "}"\nselector-pair ::= string ":" "[" ws selector-list ws "]"\n\nstring-list ::= [string ("," ws string)*]\n\nstring ::= "\\"" char* "\\""\nchar ::= [^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})\n\nnumber ::= ["-"]? ("0" | [1-9] [0-9]*) ([.] [0-9]+)? ([eE] ["-"]? [0-9]+)?\n\nboolean ::= "true" | "false"\n\nws ::= [ \\t\\n\\r]*\n';
|
|
452
|
-
|
|
453
|
-
// src/compiler/local-axir.ts
|
|
720
|
+
import { URL } from "url";
|
|
454
721
|
// Optional native backend. The dynamic import is started once at module load;
// these bindings stay undefined (and nativeAvailable stays false) when the
// "node-llama-cpp" package cannot be imported.
var nativeAvailable = false;
var getLlama;
var LlamaModel;
var LlamaContext;
var LlamaGrammar;
// Resolves to the imported module namespace, or to null when the import
// fails. It never rejects, so callers can await it unconditionally.
var llamaPromise = (async () => {
  try {
    const mod = await import("node-llama-cpp");
    nativeAvailable = true;
    getLlama = mod.getLlama;
    LlamaModel = mod.LlamaModel;
    LlamaContext = mod.LlamaContext;
    LlamaGrammar = mod.LlamaGrammar;
    return mod;
  } catch (err) {
    nativeAvailable = false;
    // The failure is only reported when AWI_DEBUG is set in the environment.
    if (process.env.AWI_DEBUG) {
      console.error("[AWI] node-llama-cpp load error:", err.message);
    }
    return null;
  }
})();
|
|
740
|
+
var DEFAULT_MODEL_URL = "https://github.com/RayAKaan/AWI/releases/download/v0.0.0-models/Phi-3-mini-128k-instruct-Q4_K_M.gguf";
|
|
464
741
|
var DEFAULT_MODEL_FILENAME = "phi3-128k-q4.gguf";
|
|
465
742
|
var LocalAXIRCompiler = class {
|
|
466
743
|
modelPath;
|
|
@@ -487,14 +764,10 @@ var LocalAXIRCompiler = class {
|
|
|
487
764
|
this.onDownloadProgress = options.onDownloadProgress;
|
|
488
765
|
this.onStatus = options.onStatus;
|
|
489
766
|
}
|
|
490
|
-
// -------------------------------------------------------------------------
|
|
491
|
-
// Public API
|
|
492
|
-
// -------------------------------------------------------------------------
|
|
493
767
|
async compile(domHTML, a11yTree, intent, params) {
|
|
494
768
|
await this._ensureModel();
|
|
495
769
|
await this._ensureGrammar();
|
|
496
|
-
const
|
|
497
|
-
const prompt = this._buildCompilePrompt(domHTML, a11y, intent, params);
|
|
770
|
+
const prompt = this._buildCompilePrompt(domHTML, a11yTree, intent, params);
|
|
498
771
|
const start = Date.now();
|
|
499
772
|
this._status("Compiling AXIR locally...");
|
|
500
773
|
const resultText = await this._complete(prompt, 4096, 0.3);
|
|
@@ -537,22 +810,26 @@ var LocalAXIRCompiler = class {
|
|
|
537
810
|
this.ready = false;
|
|
538
811
|
}
|
|
539
812
|
}
|
|
540
|
-
// -------------------------------------------------------------------------
|
|
541
|
-
// Model lifecycle
|
|
542
|
-
// -------------------------------------------------------------------------
|
|
543
813
|
async _ensureModel() {
|
|
544
814
|
if (this.ready) return;
|
|
815
|
+
await llamaPromise;
|
|
816
|
+
if (!getLlama) {
|
|
817
|
+
throw new Error("node-llama-cpp failed to load. Is it installed?");
|
|
818
|
+
}
|
|
545
819
|
if (!fs.existsSync(this.modelPath)) {
|
|
546
820
|
await this._downloadModel();
|
|
547
821
|
}
|
|
548
822
|
this._status("Loading local model...");
|
|
549
823
|
const llama = await getLlama();
|
|
550
824
|
const gpuLayers = this.gpuLayers ?? this._autoDetectGPULayers();
|
|
551
|
-
this.model = new
|
|
825
|
+
this.model = new LlamaModel({
|
|
826
|
+
llama,
|
|
552
827
|
modelPath: this.modelPath,
|
|
553
828
|
gpuLayers
|
|
554
829
|
});
|
|
555
|
-
this.context =
|
|
830
|
+
this.context = new LlamaContext({
|
|
831
|
+
llama,
|
|
832
|
+
model: this.model,
|
|
556
833
|
contextSize: this.contextSize
|
|
557
834
|
});
|
|
558
835
|
this.ready = true;
|
|
@@ -560,9 +837,10 @@ var LocalAXIRCompiler = class {
|
|
|
560
837
|
}
|
|
561
838
|
async _ensureGrammar() {
|
|
562
839
|
if (this.grammar) return;
|
|
563
|
-
const
|
|
564
|
-
this.grammar = new
|
|
565
|
-
|
|
840
|
+
const grammarPath = path.join(__dirname, "grammar", "axir-schema.gbnf");
|
|
841
|
+
this.grammar = new LlamaGrammar({
|
|
842
|
+
llama: await getLlama(),
|
|
843
|
+
grammar: fs.readFileSync(grammarPath, "utf-8")
|
|
566
844
|
});
|
|
567
845
|
}
|
|
568
846
|
_autoDetectGPULayers() {
|
|
@@ -571,17 +849,14 @@ var LocalAXIRCompiler = class {
|
|
|
571
849
|
}
|
|
572
850
|
return 0;
|
|
573
851
|
}
|
|
574
|
-
// -------------------------------------------------------------------------
|
|
575
|
-
// Resumable model download
|
|
576
|
-
// -------------------------------------------------------------------------
|
|
577
852
|
async _downloadModel() {
|
|
578
853
|
const dir = path.dirname(this.modelPath);
|
|
579
854
|
if (!fs.existsSync(dir)) {
|
|
580
855
|
fs.mkdirSync(dir, { recursive: true });
|
|
581
856
|
}
|
|
582
857
|
const tempPath = `${this.modelPath}.tmp`;
|
|
583
|
-
const
|
|
584
|
-
const protocol =
|
|
858
|
+
const url = new URL(this.modelUrl);
|
|
859
|
+
const protocol = url.protocol === "https:" ? https : http;
|
|
585
860
|
let startByte = 0;
|
|
586
861
|
if (fs.existsSync(tempPath)) {
|
|
587
862
|
startByte = fs.statSync(tempPath).size;
|
|
@@ -597,10 +872,10 @@ var LocalAXIRCompiler = class {
|
|
|
597
872
|
headers["Range"] = `bytes=${startByte}-`;
|
|
598
873
|
}
|
|
599
874
|
const request = protocol.get(
|
|
600
|
-
|
|
875
|
+
url,
|
|
601
876
|
{ headers },
|
|
602
877
|
(response) => {
|
|
603
|
-
if (response.statusCode ===
|
|
878
|
+
if (response.statusCode === 302 || response.statusCode === 301) {
|
|
604
879
|
if (response.headers.location) {
|
|
605
880
|
this.modelUrl = response.headers.location;
|
|
606
881
|
return this._downloadModel().then(resolve).catch(reject);
|
|
@@ -608,7 +883,7 @@ var LocalAXIRCompiler = class {
|
|
|
608
883
|
}
|
|
609
884
|
if (response.statusCode !== 200 && response.statusCode !== 206) {
|
|
610
885
|
return reject(
|
|
611
|
-
new Error(`
|
|
886
|
+
new Error(`Download failed: HTTP ${response.statusCode}`)
|
|
612
887
|
);
|
|
613
888
|
}
|
|
614
889
|
const total = parseInt(
|
|
@@ -642,76 +917,69 @@ var LocalAXIRCompiler = class {
|
|
|
642
917
|
});
|
|
643
918
|
});
|
|
644
919
|
}
|
|
645
|
-
// -------------------------------------------------------------------------
|
|
646
|
-
// Inference
|
|
647
|
-
// -------------------------------------------------------------------------
|
|
648
920
|
async _complete(prompt, maxTokens, temperature) {
|
|
649
921
|
if (!this.context) throw new Error("Model not loaded");
|
|
650
922
|
const sequence = this.context.getSequence();
|
|
651
923
|
await sequence.evaluate(prompt);
|
|
652
|
-
const
|
|
924
|
+
const result = await sequence.generateResponse(maxTokens, {
|
|
653
925
|
temperature,
|
|
654
926
|
grammar: this.grammar
|
|
655
927
|
});
|
|
656
928
|
let text = "";
|
|
657
|
-
for await (const token of
|
|
929
|
+
for await (const token of result) {
|
|
658
930
|
text += token;
|
|
659
931
|
}
|
|
660
932
|
return text;
|
|
661
933
|
}
|
|
662
|
-
// -------------------------------------------------------------------------
|
|
663
|
-
// Prompt builders
|
|
664
|
-
// -------------------------------------------------------------------------
|
|
665
934
|
_buildCompilePrompt(domHTML, a11yTree, intent, params) {
|
|
666
935
|
const paramsJson = params ? JSON.stringify(params, null, 2) : "{}";
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
936
|
+
const a11y = a11yTree || "No accessibility tree available.";
|
|
937
|
+
return `|<|system|>
|
|
938
|
+
You are an expert web-scraping analyst. Your job is to read a simplified DOM and accessibility tree, then output a structured JSON object describing the page layout, interactive elements, and data extraction plan.
|
|
939
|
+
|
|
940
|
+
Output MUST be valid JSON matching this schema:
|
|
941
|
+
- workflow.nodes: map of node_id -> {element_type, semantic_role, intent, tag, selector_candidates[], parent_id?, children_ids?, aria_label?, aria_role?, text_content?, confidence, reasoning?}
|
|
942
|
+
- workflow.edges: list of {from_node, to_node, action, condition?, probability}
|
|
943
|
+
- workflow.entry_points: list of starting node_ids
|
|
944
|
+
- workflow.exit_points: list of terminal node_ids
|
|
945
|
+
- workflow.domain: the domain name
|
|
946
|
+
- workflow.page_type: one of landing|search|listing|detail|form|checkout|dashboard|unknown
|
|
947
|
+
- intents: list of {intent, action, parameters[], context}
|
|
948
|
+
- selectors: map of selector_name -> list of {type, value, priority}
|
|
949
|
+
- fields: list of {name, selector, transform?, required}
|
|
950
|
+
- container?: string (optional container selector name)
|
|
951
|
+
|
|
952
|
+
Element types: button, link, input, form, navigation, search, filter, sort, pagination, container, list, item, heading, text, image, unknown.
|
|
953
|
+
Selector types: css, semantic, text, attribute.
|
|
954
|
+
|<|user|>
|
|
955
|
+
Intent: ${intent}
|
|
956
|
+
Parameters: ${paramsJson}
|
|
957
|
+
|
|
958
|
+
Simplified DOM:
|
|
959
|
+
${this._truncate(domHTML, 4e4)}
|
|
960
|
+
|
|
961
|
+
Accessibility Tree:
|
|
962
|
+
${this._truncate(a11y, 8e3)}
|
|
963
|
+
|
|
964
|
+
Compile AXIR:
|
|
965
|
+
|<|assistant|>
|
|
966
|
+
`;
|
|
698
967
|
}
|
|
699
968
|
_buildHealPrompt(domHTML, brokenSelector, semanticIntent) {
|
|
700
|
-
return
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
].join("\n");
|
|
969
|
+
return `|<|system|>
|
|
970
|
+
You are a CSS selector repair tool. Given a broken selector and the current DOM, output the new CSS selector that targets the same semantic element.
|
|
971
|
+
|
|
972
|
+
Output JSON: {"selector": "...", "confidence": 0.0-1.0, "reasoning": "..."}
|
|
973
|
+
|<|user|>
|
|
974
|
+
Broken selector: ${brokenSelector}
|
|
975
|
+
Semantic intent: ${semanticIntent}
|
|
976
|
+
|
|
977
|
+
Current DOM (truncated):
|
|
978
|
+
${this._truncate(domHTML, 2e4)}
|
|
979
|
+
|
|
980
|
+
New selector:
|
|
981
|
+
|<|assistant|>
|
|
982
|
+
`;
|
|
715
983
|
}
|
|
716
984
|
_truncate(text, maxChars) {
|
|
717
985
|
if (text.length <= maxChars) return text;
|
|
@@ -726,9 +994,8 @@ var LocalAXIRCompiler = class {
|
|
|
726
994
|
};
|
|
727
995
|
export {
|
|
728
996
|
AWIClient,
|
|
729
|
-
|
|
997
|
+
AXIRCompiler,
|
|
730
998
|
AdvisoryExecutor,
|
|
731
|
-
LocalAXIRCompiler
|
|
732
|
-
client_default as default
|
|
999
|
+
LocalAXIRCompiler
|
|
733
1000
|
};
|
|
734
1001
|
//# sourceMappingURL=index.mjs.map
|