@awi-protocol/sdk 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +36 -8
- package/dist/index.d.ts +36 -8
- package/dist/index.js +365 -91
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +364 -97
- package/dist/index.mjs.map +1 -1
- package/package.json +29 -50
- package/src/compiler/grammar/axir-schema.gbnf +97 -0
package/dist/index.js
CHANGED
|
@@ -31,10 +31,9 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
AWIClient: () => AWIClient,
|
|
34
|
-
|
|
34
|
+
AXIRCompiler: () => AXIRCompiler,
|
|
35
35
|
AdvisoryExecutor: () => AdvisoryExecutor,
|
|
36
|
-
LocalAXIRCompiler: () => LocalAXIRCompiler
|
|
37
|
-
default: () => client_default
|
|
36
|
+
LocalAXIRCompiler: () => LocalAXIRCompiler
|
|
38
37
|
});
|
|
39
38
|
module.exports = __toCommonJS(index_exports);
|
|
40
39
|
|
|
@@ -242,7 +241,6 @@ var AWIClient = class {
|
|
|
242
241
|
throw lastError || new AWIError("MAX_RETRIES", "Max retries exceeded", 502);
|
|
243
242
|
}
|
|
244
243
|
};
|
|
245
|
-
var client_default = AWIClient;
|
|
246
244
|
|
|
247
245
|
// src/advisory-executor.ts
|
|
248
246
|
var AdvisoryExecutor = class {
|
|
@@ -473,27 +471,312 @@ var AdvisoryExecutor = class {
|
|
|
473
471
|
}
|
|
474
472
|
};
|
|
475
473
|
|
|
474
|
+
// src/compiler/axir-compiler.ts
|
|
475
|
+
var cheerio = __toESM(require("cheerio"));
|
|
476
|
+
/**
 * Deterministic (model-free) AXIR compiler.
 *
 * Parses an HTML document with cheerio, strips non-content nodes, groups the
 * remaining DOM into coarse "regions" (search, navigation, listing, form,
 * pagination, detail), picks the region that best matches the requested
 * intent and emits a workflow graph, intent descriptors, selector candidates
 * and form-field descriptors for it.
 */
var AXIRCompiler = class {
  $;
  intent;
  params;
  domain;

  /**
   * @param {string} html Raw HTML of the page to compile.
   * @param {{intent: string, params?: object, domain?: string}} [options]
   *   `intent` selects the routing table entry; `params` defaults to `{}` and
   *   `domain` to `"unknown"`.
   */
  constructor(html, options = {}) {
    this.$ = cheerio.load(html);
    this.intent = options.intent;
    this.params = options.params || {};
    this.domain = options.domain || "unknown";
  }

  /**
   * Runs the full pipeline and returns the compiled AXIR object.
   * Mutates the loaded DOM (see simplifyDOM).
   */
  compile() {
    const start = Date.now();
    this.simplifyDOM();
    const regions = this.identifyRegions();
    const target = this.routeIntent(regions);
    return {
      workflow: this.buildWorkflow(target, regions),
      intents: this.mapIntents(),
      selectors: this.generateSelectors(target),
      fields: this.generateFields(target),
      container: target.container,
      model_used: "axir-deterministic-v1",
      tokens_used: 0,
      compilation_time_ms: Date.now() - start
    };
  }

  /**
   * Removes non-content elements, explicitly hidden elements and empty
   * div/span wrappers from the loaded document.
   */
  simplifyDOM() {
    this.$("script, style, svg, noscript, iframe, canvas, video, audio").remove();
    // Hidden nodes go first so wrappers that only contained hidden content
    // are seen as empty by the pruning pass below.
    this.$('[style*="display:none"], [style*="display: none"], [hidden], [aria-hidden="true"]').remove();
    // Reverse document order visits descendants before their ancestors, so a
    // chain of nested empty wrappers collapses in a single pass.
    const wrappers = this.$("div, span").toArray().reverse();
    for (const el of wrappers) {
      const $el = this.$(el);
      if ($el.children().length === 0 && $el.text().trim() === "") $el.remove();
    }
  }

  /**
   * Scans the document and returns the list of detected regions, each
   * `{ type, element, confidence }`. The same element may appear under more
   * than one type, except that a form already reported as "search" is not
   * reported again as "form".
   */
  identifyRegions() {
    const regions = [];
    // NOTE(review): any <form> with a text input or a button is reported as
    // "search" here, so it never shows up as a "form" region below. Intents
    // that prefer "form" (login, fill_form) may want a stricter search test.
    for (const el of this.$('form, [role="search"], input[type="search"]').toArray()) {
      const r = this.analyzeSearchRegion(this.$(el));
      if (r) regions.push(r);
    }
    for (const el of this.$('nav, [role="navigation"], header, .nav, .navbar, .menu').toArray()) {
      regions.push({ type: "navigation", element: this.$(el), confidence: 0.9 });
    }
    for (const el of this.$('ul, ol, [role="list"], .list, .results, .items, table, [role="grid"]').toArray()) {
      const $el = this.$(el);
      if ($el.find('li, tr, .item, [role="listitem"]').length > 1) {
        regions.push({ type: "listing", element: $el, confidence: 0.85 });
      }
    }
    for (const el of this.$("form").toArray()) {
      const $el = this.$(el);
      if (!regions.some((r) => r.element.is($el))) regions.push({ type: "form", element: $el, confidence: 0.9 });
    }
    for (const el of this.$(".pagination, .pager, .pages").toArray()) {
      if (this.isPagination(this.$(el))) regions.push({ type: "pagination", element: this.$(el), confidence: 0.8 });
    }
    for (const el of this.$('article, [role="article"], .content, .main, main, .detail').toArray()) {
      regions.push({ type: "detail", element: this.$(el), confidence: 0.75 });
    }
    return regions;
  }

  /**
   * Decides whether `$el` is a search region.
   *
   * A container qualifies when it holds a text-like input or a submit
   * control (0.95 when it has both, 0.7 otherwise). A text-like input passed
   * on its own qualifies (0.7) only when it is not nested in a form or
   * `[role="search"]` container, because that container is reported instead.
   *
   * @returns {{type: string, element: object, confidence: number} | null}
   */
  analyzeSearchRegion($el) {
    const inputSelector = 'input[type="text"], input[type="search"], input:not([type])';
    // `find` only inspects descendants, so a bare input has to be tested
    // against the selector itself or it could never be recognised.
    if ($el.is(inputSelector)) {
      if ($el.parents('form, [role="search"]').length > 0) return null;
      return { type: "search", element: $el, confidence: 0.7 };
    }
    const hasInput = $el.find(inputSelector).length > 0;
    const hasButton = $el.find('button, input[type="submit"]').length > 0;
    if (hasInput || hasButton) return { type: "search", element: $el, confidence: hasInput && hasButton ? 0.95 : 0.7 };
    return null;
  }

  /** True when the element's text has a digit and a next/previous marker. */
  isPagination($el) {
    const text = $el.text().toLowerCase();
    return /\d+/.test(text) && (/next|>|\u203a|\u2192|\u00bb/.test(text) || /prev|previous|<|\u2039|\u2190|\u00ab/.test(text));
  }

  /**
   * Looks up the current intent (case-insensitively) among the table's own
   * keys only, so names inherited from Object.prototype such as
   * "constructor" are treated as unknown intents instead of returning a
   * function.
   */
  lookupIntent(table) {
    const key = String(this.intent ?? "").toLowerCase();
    return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : void 0;
  }

  /**
   * Picks the region that best serves the current intent.
   *
   * Each intent lists region types in order of preference; a region scores
   * `(listLength - positionInList) * confidence`. When nothing matches, the
   * region with the most descendants (or `<body>`) is used.
   *
   * @returns {{region: object, container: string}}
   */
  routeIntent(regions) {
    const intentMap = {
      search: ["search", "form"],
      search_jobs: ["search", "listing", "form"],
      extract_list: ["listing", "search", "detail"],
      extract_detail: ["detail", "listing"],
      fill_form: ["form", "search"],
      navigate: ["navigation", "listing"],
      login: ["form"],
      filter: ["search", "listing"],
      sort: ["listing", "search"],
      scrape: ["listing", "detail", "search"]
    };
    const targetTypes = this.lookupIntent(intentMap) || ["search", "listing", "form"];
    let best = null;
    let bestScore = 0;
    for (const r of regions) {
      const match = targetTypes.indexOf(r.type);
      const score = match >= 0 ? (targetTypes.length - match) * r.confidence : 0;
      if (score > bestScore) {
        bestScore = score;
        best = r;
      }
    }
    if (!best) best = this.findLargestRegion(regions);
    return { region: best, container: this.generateContainerSelector(best.element) };
  }

  /** Returns the region with the most descendants, or a `<body>` placeholder. */
  findLargestRegion(regions) {
    if (regions.length === 0) return { type: "unknown", element: this.$("body"), confidence: 0.5 };
    return regions.reduce((l, c) => c.element.find("*").length > l.element.find("*").length ? c : l);
  }

  /**
   * Builds the workflow graph: one node per region (id `<type>_<index>`),
   * edges inferred for every ordered pair of regions, navigation regions as
   * entry points and listing/detail regions as exit points.
   */
  buildWorkflow(_target, all) {
    const nodes = {};
    const edges = [];
    const entry = [];
    const exit = [];
    all.forEach((r, i) => {
      const id = `${r.type}_${i}`;
      const raw = r.element[0];
      nodes[id] = {
        node_id: id,
        element_type: this.mapType(r.type),
        semantic_role: r.type,
        intent: this.intent,
        tag: raw?.tagName?.toLowerCase(),
        selector_candidates: this.buildCandidates(r.element),
        confidence: r.confidence
      };
      if (r.type === "navigation") entry.push(id);
      if (r.type === "listing" || r.type === "detail") exit.push(id);
    });
    all.forEach((f, fi) => all.forEach((t, ti) => {
      if (fi !== ti) {
        const e = this.inferEdge(f, t, fi, ti);
        if (e) edges.push(e);
      }
    }));
    // Without a navigation region, fall back to the first node as the entry.
    if (entry.length === 0 && Object.keys(nodes).length > 0) entry.push(Object.keys(nodes)[0]);
    return { nodes, edges, entry_points: entry, exit_points: exit, domain: this.domain, page_type: this.inferPageType(all) };
  }

  /** Maps a region type to the element_type used on workflow nodes. */
  mapType(t) {
    const m = {
      search: "search",
      navigation: "navigation",
      listing: "list",
      form: "form",
      pagination: "pagination",
      detail: "container"
    };
    return Object.prototype.hasOwnProperty.call(m, t) ? m[t] : "unknown";
  }

  /**
   * Returns the edge implied by going from region `f` (index `fi`) to region
   * `t` (index `ti`), or null when the pair has no known transition. Node ids
   * follow the `<type>_<index>` scheme used by buildWorkflow.
   */
  inferEdge(f, t, fi, ti) {
    if (f.type === "search" && t.type === "listing") return { from_node: `search_${fi}`, to_node: `listing_${ti}`, action: "submit_search", probability: 0.9 };
    if (f.type === "navigation" && t.type === "search") return { from_node: `navigation_${fi}`, to_node: `search_${ti}`, action: "navigate_to_search", probability: 0.7 };
    if (f.type === "listing" && t.type === "pagination") return { from_node: `listing_${fi}`, to_node: `pagination_${ti}`, action: "next_page", probability: 0.8 };
    if (f.type === "pagination" && t.type === "listing") return { from_node: `pagination_${fi}`, to_node: `listing_${ti}`, action: "load_results", probability: 0.95 };
    return null;
  }

  /** Derives the page type from the set of region types that were found. */
  inferPageType(regions) {
    const t = regions.map((r) => r.type);
    if (t.includes("search") && t.includes("listing")) return "search";
    if (t.includes("listing")) return "listing";
    if (t.includes("form")) return "form";
    if (t.includes("search")) return "search";
    if (t.includes("navigation")) return "landing";
    // Checked last so every previously classified page keeps its type; only
    // pages that used to be "unknown" despite having detail regions change.
    if (t.includes("detail")) return "detail";
    return "unknown";
  }

  /**
   * Returns `base` if it is not yet an own key of `taken`, otherwise the
   * first free `base_2`, `base_3`, ... variant.
   */
  uniqueKey(taken, base) {
    const has = (k) => Object.prototype.hasOwnProperty.call(taken, k);
    if (!has(base)) return base;
    let n = 2;
    while (has(`${base}_${n}`)) n += 1;
    return `${base}_${n}`;
  }

  /**
   * Collects selector candidates for the target region: the container
   * itself, every named field, every button (`btn_<label>`) and every link
   * with text shorter than 50 characters (`link_<text>`). Colliding names
   * get a numeric suffix instead of overwriting the earlier entry.
   */
  generateSelectors(target) {
    const s = {};
    const $el = target.region.element;
    s.container = this.buildCandidates($el);
    for (const el of $el.find("input, textarea, select").toArray()) {
      const n = this.inferFieldName(this.$(el));
      if (n) s[this.uniqueKey(s, n)] = this.buildCandidates(this.$(el));
    }
    for (const el of $el.find('button, input[type="submit"], input[type="button"]').toArray()) {
      const $btn = this.$(el);
      const label = $btn.text().trim() || String($btn.val() || "button");
      s[this.uniqueKey(s, `btn_${this.slugify(label)}`)] = this.buildCandidates($btn);
    }
    for (const el of $el.find("a").toArray()) {
      const $a = this.$(el);
      const t = $a.text().trim();
      if (t && t.length < 50) s[this.uniqueKey(s, `link_${this.slugify(t)}`)] = this.buildCandidates($a);
    }
    return s;
  }

  /**
   * Builds the prioritised list of ways to locate `$el`: id, class list,
   * name/type/placeholder attributes, role, aria-label and finally visible
   * text (under 100 characters). Returns `[]` for an empty selection.
   *
   * @returns {Array<{type: string, value: string, priority: number, confidence: number}>}
   */
  buildCandidates($el) {
    const c = [];
    const el = $el[0];
    if (!el) return c;
    const id = $el.attr("id");
    if (id && !id.match(/^\d/)) c.push({ type: "css", value: `#${this.escape(id)}`, priority: 1, confidence: 0.99 });
    // Framework/behaviour hook classes are skipped as selector material.
    const classes = ($el.attr("class") || "").split(/\s+/).filter((x) => x && !x.match(/^js-|^ng-|^vue-|^data-/));
    if (classes.length) c.push({ type: "css", value: `.${classes.map((x) => this.escape(x)).join(".")}`, priority: 2, confidence: 0.85 });
    const tag = el.tagName?.toLowerCase() || "";
    const name = $el.attr("name");
    const type = $el.attr("type");
    const placeholder = $el.attr("placeholder");
    if (name) c.push({ type: "css", value: `${tag}[name="${this.q(name)}"]`, priority: 3, confidence: 0.9 });
    if (type) c.push({ type: "css", value: `${tag}[type="${this.q(type)}"]`, priority: 4, confidence: 0.8 });
    if (placeholder) c.push({ type: "css", value: `${tag}[placeholder="${this.q(placeholder)}"]`, priority: 5, confidence: 0.75 });
    const role = $el.attr("role");
    if (role) c.push({ type: "semantic", value: `[role="${this.q(role)}"]`, priority: 6, confidence: 0.9 });
    const al = $el.attr("aria-label");
    if (al) c.push({ type: "semantic", value: `[aria-label="${this.q(al)}"]`, priority: 7, confidence: 0.85 });
    const text = $el.text().trim();
    if (text && text.length < 100) c.push({ type: "text", value: text, priority: 8, confidence: 0.7 });
    return c;
  }

  /**
   * Describes every named input/textarea/select of the target region as
   * `{ name, selector, transform, required }`. Controls for which no name
   * can be inferred are skipped.
   */
  generateFields(target) {
    const f = [];
    for (const el of target.region.element.find("input, textarea, select").toArray()) {
      const $el = this.$(el);
      const name = this.inferFieldName($el);
      if (!name) continue;
      f.push({
        name,
        selector: this.bestSelector($el),
        transform: this.inferTransform($el),
        required: $el.attr("required") !== void 0
      });
    }
    return f;
  }

  /**
   * Infers a slug name for a form control from, in order: the text of a
   * `<label for>` pointing at it, its placeholder, aria-label and name. A
   * source that slugifies to an empty string is skipped.
   *
   * @returns {string | null}
   */
  inferFieldName($el) {
    const sources = [];
    const id = $el.attr("id");
    if (id) {
      // Compare the attribute value directly instead of interpolating the id
      // into a selector string: ids may contain quotes or other characters
      // that make the selector unparsable.
      const $label = this.$("label").filter((_i, node) => this.$(node).attr("for") === id);
      if ($label.length) sources.push($label.text());
    }
    sources.push($el.attr("placeholder"), $el.attr("aria-label"), $el.attr("name"));
    for (const source of sources) {
      if (!source) continue;
      const slug = this.slugify(source);
      if (slug) return slug;
    }
    return null;
  }

  /** Maps the control's type to a value transform name, if any. */
  inferTransform($el) {
    const t = $el.attr("type");
    if (t === "number") return "number";
    if (t === "email") return "email";
    if (t === "date") return "date";
    if (t === "checkbox") return "boolean";
    if ($el.is("select")) return "select";
    return void 0;
  }

  /**
   * Highest-priority selector for `$el`, ignoring "text" candidates (which
   * hold visible text, not a selector). Falls back to the tag name, then "*".
   */
  bestSelector($el) {
    const pick = this.buildCandidates($el).find((c) => c.type !== "text");
    if (pick) return pick.value;
    const raw = $el[0];
    return raw?.tagName?.toLowerCase() || "*";
  }

  /**
   * Highest-priority selector for a region container, ignoring "text"
   * candidates. Falls back to "body".
   */
  generateContainerSelector($el) {
    const pick = this.buildCandidates($el).find((c) => c.type !== "text");
    return pick ? pick.value : "body";
  }

  /**
   * Returns a one-element list describing how to carry out the current
   * intent. Unknown intents get a generic "interact" descriptor whose
   * parameters are the keys of `params`.
   */
  mapIntents() {
    const m = {
      search: { intent: "search", action: "fill_and_submit", parameters: ["query", "location", "filters"], context: "Enter search terms and submit form" },
      search_jobs: { intent: "search_jobs", action: "fill_and_submit", parameters: ["query", "location", "experience_level", "job_type"], context: "Search for job listings with optional filters" },
      extract_list: { intent: "extract_list", action: "extract_fields", parameters: ["items", "title", "url", "metadata"], context: "Extract structured data from list items" },
      extract_detail: { intent: "extract_detail", action: "extract_fields", parameters: ["title", "description", "metadata", "links"], context: "Extract structured data from detail page" },
      fill_form: { intent: "fill_form", action: "fill_and_submit", parameters: Object.keys(this.params), context: "Fill form fields with provided parameters" },
      navigate: { intent: "navigate", action: "click", parameters: ["target_url", "link_text"], context: "Click navigation link to target page" },
      login: { intent: "login", action: "fill_and_submit", parameters: ["username", "password"], context: "Enter credentials and submit login form" },
      scrape: { intent: "scrape", action: "extract_fields", parameters: ["all_visible_text", "links", "images", "structured_data"], context: "Extract all visible content from the page" }
    };
    const mapped = this.lookupIntent(m);
    if (mapped) return [mapped];
    return [{ intent: this.intent, action: "interact", parameters: Object.keys(this.params), context: `Perform ${this.intent} on the page` }];
  }

  /**
   * Lower-cases `t`, drops everything but word characters, whitespace and
   * hyphens, joins the remaining parts with "_" and caps the result at 50
   * characters.
   */
  slugify(t) {
    return t.toLowerCase().replace(/[^\w\s-]/g, "").replace(/[\s_-]+/g, "_").replace(/^_|_$/g, "").substring(0, 50);
  }

  /**
   * Escapes an id or class name for use in a CSS selector: every ASCII
   * character other than letters, digits, "_" and "-" is backslash-escaped,
   * and a leading digit (optionally after one "-") is written as a hex
   * escape because identifiers may not start with a digit.
   */
  escape(s) {
    return s
      .replace(/[^\w\u0080-\uffff-]/g, "\\$&")
      .replace(/^(-?)(\d)/, (_m, dash, digit) => `${dash}\\3${digit} `);
  }

  /**
   * Escapes a value for use inside a double-quoted CSS attribute string.
   * Backslashes are doubled first so they are not read as CSS escapes.
   */
  q(s) {
    return s.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
  }
};
|
|
752
|
+
|
|
476
753
|
// src/compiler/local-axir.ts
|
|
477
754
|
var fs = __toESM(require("fs"));
|
|
478
755
|
var path = __toESM(require("path"));
|
|
479
756
|
var os = __toESM(require("os"));
|
|
480
757
|
var https = __toESM(require("https"));
|
|
481
758
|
var http = __toESM(require("http"));
|
|
482
|
-
|
|
483
|
-
// src/compiler/grammar/axir-schema.gbnf
|
|
484
|
-
var axir_schema_default = 'root ::= "{" ws axir-fields ws "}"\n\naxir-fields ::=\n "\\"workflow\\"" ":" workflow ws ","\n ws "\\"intents\\"" ":" intents ws ","\n ws "\\"selectors\\"" ":" selectors ws ","\n ws "\\"fields\\"" ":" fields\n ["," ws "\\"container\\"" ":" string]\n ["," ws "\\"model_used\\"" ":" string]\n ["," ws "\\"tokens_used\\"" ":" number]\n ["," ws "\\"compilation_time_ms\\"" ":" number]\n\nworkflow ::=\n "{" ws\n "\\"nodes\\"" ":" "{" ws node-list ws "}" ws ","\n ws "\\"edges\\"" ":" "[" ws edge-list ws "]" ws ","\n ws "\\"entry_points\\"" ":" "[" ws string-list ws "]" ws ","\n ws "\\"exit_points\\"" ":" "[" ws string-list ws "]" ws ","\n ws "\\"domain\\"" ":" string ws ","\n ws "\\"page_type\\"" ":" page-type\n ["," ws "\\"structure_hash\\"" ":" string]\n ws "}"\n\nnode-list ::= [node-pair ("," ws node-pair)*]\nnode-pair ::= string ":" "{" ws\n "\\"element_type\\"" ":" element-type ws ","\n ws "\\"semantic_role\\"" ":" string ws ","\n ws "\\"intent\\"" ":" string ws ","\n ws "\\"tag\\"" ":" string ws ","\n ws "\\"selector_candidates\\"" ":" "[" ws selector-list ws "]"\n ["," ws "\\"parent_id\\"" ":" string]\n ["," ws "\\"children_ids\\"" ":" "[" ws string-list ws "]"]\n ["," ws "\\"aria_label\\"" ":" string]\n ["," ws "\\"aria_role\\"" ":" string]\n ["," ws "\\"text_content\\"" ":" string]\n "," ws "\\"confidence\\"" ":" number\n ["," ws "\\"reasoning\\"" ":" string]\nws "}"\n\nelement-type ::=\n "\\"button\\"" | "\\"link\\"" | "\\"input\\"" | "\\"form\\"" |\n "\\"navigation\\"" | "\\"search\\"" | "\\"filter\\"" | "\\"sort\\"" |\n "\\"pagination\\"" | "\\"container\\"" | "\\"list\\"" | "\\"item\\"" |\n "\\"heading\\"" | "\\"text\\"" | "\\"image\\"" | "\\"unknown\\""\n\npage-type ::=\n "\\"landing\\"" | "\\"search\\"" | "\\"listing\\"" | "\\"detail\\"" |\n "\\"form\\"" | "\\"checkout\\"" | "\\"dashboard\\"" | "\\"unknown\\""\n\nedge-list ::= [edge ("," ws edge)*]\nedge ::= "{" ws\n "\\"from_node\\"" ":" string ws ","\n ws "\\"to_node\\"" ":" 
string ws ","\n ws "\\"action\\"" ":" string\n ["," ws "\\"condition\\"" ":" string]\n ["," ws "\\"probability\\"" ":" number]\nws "}"\n\nselector-list ::= [selector ("," ws selector)*]\nselector ::= "{" ws\n "\\"type\\"" ":" selector-type ws ","\n ws "\\"value\\"" ":" string ws ","\n ws "\\"priority\\"" ":" number\n ["," ws "\\"confidence\\"" ":" number]\nws "}"\n\nselector-type ::= "\\"css\\"" | "\\"semantic\\"" | "\\"text\\"" | "\\"attribute\\""\n\nintents ::= "[" ws [intent ("," ws intent)*] ws "]"\nintent ::= "{" ws\n "\\"intent\\"" ":" string ws ","\n ws "\\"action\\"" ":" string ws ","\n ws "\\"parameters\\"" ":" "[" ws [string ("," ws string)*] ws "]" ws ","\n ws "\\"context\\"" ":" string\nws "}"\n\nfields ::= "[" ws [field ("," ws field)*] ws "]"\nfield ::= "{" ws\n "\\"name\\"" ":" string ws ","\n ws "\\"selector\\"" ":" string\n ["," ws "\\"transform\\"" ":" string]\n ["," ws "\\"required\\"" ":" boolean]\nws "}"\n\nselectors ::= "{" ws [selector-pair ("," ws selector-pair)*] ws "}"\nselector-pair ::= string ":" "[" ws selector-list ws "]"\n\nstring-list ::= [string ("," ws string)*]\n\nstring ::= "\\"" char* "\\""\nchar ::= [^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})\n\nnumber ::= ["-"]? ("0" | [1-9] [0-9]*) ([.] [0-9]+)? ([eE] ["-"]? [0-9]+)?\n\nboolean ::= "true" | "false"\n\nws ::= [ \\t\\n\\r]*\n';
|
|
485
|
-
|
|
486
|
-
// src/compiler/local-axir.ts
|
|
759
|
+
var import_url = require("url");
|
|
487
760
|
var nativeAvailable = false;
|
|
488
761
|
var getLlama;
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
762
|
+
var LlamaModel;
|
|
763
|
+
var LlamaContext;
|
|
764
|
+
var LlamaGrammar;
|
|
765
|
+
var llamaPromise = import("node-llama-cpp").then((m) => {
|
|
492
766
|
nativeAvailable = true;
|
|
493
|
-
|
|
767
|
+
getLlama = m.getLlama;
|
|
768
|
+
LlamaModel = m.LlamaModel;
|
|
769
|
+
LlamaContext = m.LlamaContext;
|
|
770
|
+
LlamaGrammar = m.LlamaGrammar;
|
|
771
|
+
return m;
|
|
772
|
+
}).catch((err) => {
|
|
494
773
|
nativeAvailable = false;
|
|
495
|
-
|
|
496
|
-
|
|
774
|
+
if (process.env.AWI_DEBUG) {
|
|
775
|
+
console.error("[AWI] node-llama-cpp load error:", err.message);
|
|
776
|
+
}
|
|
777
|
+
return null;
|
|
778
|
+
});
|
|
779
|
+
var DEFAULT_MODEL_URL = "https://github.com/RayAKaan/AWI/releases/download/v0.0.0-models/Phi-3-mini-128k-instruct-Q4_K_M.gguf";
|
|
497
780
|
var DEFAULT_MODEL_FILENAME = "phi3-128k-q4.gguf";
|
|
498
781
|
var LocalAXIRCompiler = class {
|
|
499
782
|
modelPath;
|
|
@@ -520,14 +803,10 @@ var LocalAXIRCompiler = class {
|
|
|
520
803
|
this.onDownloadProgress = options.onDownloadProgress;
|
|
521
804
|
this.onStatus = options.onStatus;
|
|
522
805
|
}
|
|
523
|
-
// -------------------------------------------------------------------------
|
|
524
|
-
// Public API
|
|
525
|
-
// -------------------------------------------------------------------------
|
|
526
806
|
async compile(domHTML, a11yTree, intent, params) {
|
|
527
807
|
await this._ensureModel();
|
|
528
808
|
await this._ensureGrammar();
|
|
529
|
-
const
|
|
530
|
-
const prompt = this._buildCompilePrompt(domHTML, a11y, intent, params);
|
|
809
|
+
const prompt = this._buildCompilePrompt(domHTML, a11yTree, intent, params);
|
|
531
810
|
const start = Date.now();
|
|
532
811
|
this._status("Compiling AXIR locally...");
|
|
533
812
|
const resultText = await this._complete(prompt, 4096, 0.3);
|
|
@@ -570,22 +849,26 @@ var LocalAXIRCompiler = class {
|
|
|
570
849
|
this.ready = false;
|
|
571
850
|
}
|
|
572
851
|
}
|
|
573
|
-
// -------------------------------------------------------------------------
|
|
574
|
-
// Model lifecycle
|
|
575
|
-
// -------------------------------------------------------------------------
|
|
576
852
|
async _ensureModel() {
|
|
577
853
|
if (this.ready) return;
|
|
854
|
+
await llamaPromise;
|
|
855
|
+
if (!getLlama) {
|
|
856
|
+
throw new Error("node-llama-cpp failed to load. Is it installed?");
|
|
857
|
+
}
|
|
578
858
|
if (!fs.existsSync(this.modelPath)) {
|
|
579
859
|
await this._downloadModel();
|
|
580
860
|
}
|
|
581
861
|
this._status("Loading local model...");
|
|
582
862
|
const llama = await getLlama();
|
|
583
863
|
const gpuLayers = this.gpuLayers ?? this._autoDetectGPULayers();
|
|
584
|
-
this.model = new
|
|
864
|
+
this.model = new LlamaModel({
|
|
865
|
+
llama,
|
|
585
866
|
modelPath: this.modelPath,
|
|
586
867
|
gpuLayers
|
|
587
868
|
});
|
|
588
|
-
this.context =
|
|
869
|
+
this.context = new LlamaContext({
|
|
870
|
+
llama,
|
|
871
|
+
model: this.model,
|
|
589
872
|
contextSize: this.contextSize
|
|
590
873
|
});
|
|
591
874
|
this.ready = true;
|
|
@@ -593,9 +876,10 @@ var LocalAXIRCompiler = class {
|
|
|
593
876
|
}
|
|
594
877
|
async _ensureGrammar() {
|
|
595
878
|
if (this.grammar) return;
|
|
596
|
-
const
|
|
597
|
-
this.grammar = new
|
|
598
|
-
|
|
879
|
+
const grammarPath = path.join(__dirname, "grammar", "axir-schema.gbnf");
|
|
880
|
+
this.grammar = new LlamaGrammar({
|
|
881
|
+
llama: await getLlama(),
|
|
882
|
+
grammar: fs.readFileSync(grammarPath, "utf-8")
|
|
599
883
|
});
|
|
600
884
|
}
|
|
601
885
|
_autoDetectGPULayers() {
|
|
@@ -604,17 +888,14 @@ var LocalAXIRCompiler = class {
|
|
|
604
888
|
}
|
|
605
889
|
return 0;
|
|
606
890
|
}
|
|
607
|
-
// -------------------------------------------------------------------------
|
|
608
|
-
// Resumable model download
|
|
609
|
-
// -------------------------------------------------------------------------
|
|
610
891
|
async _downloadModel() {
|
|
611
892
|
const dir = path.dirname(this.modelPath);
|
|
612
893
|
if (!fs.existsSync(dir)) {
|
|
613
894
|
fs.mkdirSync(dir, { recursive: true });
|
|
614
895
|
}
|
|
615
896
|
const tempPath = `${this.modelPath}.tmp`;
|
|
616
|
-
const
|
|
617
|
-
const protocol =
|
|
897
|
+
const url = new import_url.URL(this.modelUrl);
|
|
898
|
+
const protocol = url.protocol === "https:" ? https : http;
|
|
618
899
|
let startByte = 0;
|
|
619
900
|
if (fs.existsSync(tempPath)) {
|
|
620
901
|
startByte = fs.statSync(tempPath).size;
|
|
@@ -630,10 +911,10 @@ var LocalAXIRCompiler = class {
|
|
|
630
911
|
headers["Range"] = `bytes=${startByte}-`;
|
|
631
912
|
}
|
|
632
913
|
const request = protocol.get(
|
|
633
|
-
|
|
914
|
+
url,
|
|
634
915
|
{ headers },
|
|
635
916
|
(response) => {
|
|
636
|
-
if (response.statusCode ===
|
|
917
|
+
if (response.statusCode === 302 || response.statusCode === 301) {
|
|
637
918
|
if (response.headers.location) {
|
|
638
919
|
this.modelUrl = response.headers.location;
|
|
639
920
|
return this._downloadModel().then(resolve).catch(reject);
|
|
@@ -641,7 +922,7 @@ var LocalAXIRCompiler = class {
|
|
|
641
922
|
}
|
|
642
923
|
if (response.statusCode !== 200 && response.statusCode !== 206) {
|
|
643
924
|
return reject(
|
|
644
|
-
new Error(`
|
|
925
|
+
new Error(`Download failed: HTTP ${response.statusCode}`)
|
|
645
926
|
);
|
|
646
927
|
}
|
|
647
928
|
const total = parseInt(
|
|
@@ -675,76 +956,69 @@ var LocalAXIRCompiler = class {
|
|
|
675
956
|
});
|
|
676
957
|
});
|
|
677
958
|
}
|
|
678
|
-
// -------------------------------------------------------------------------
|
|
679
|
-
// Inference
|
|
680
|
-
// -------------------------------------------------------------------------
|
|
681
959
|
async _complete(prompt, maxTokens, temperature) {
|
|
682
960
|
if (!this.context) throw new Error("Model not loaded");
|
|
683
961
|
const sequence = this.context.getSequence();
|
|
684
962
|
await sequence.evaluate(prompt);
|
|
685
|
-
const
|
|
963
|
+
const result = await sequence.generateResponse(maxTokens, {
|
|
686
964
|
temperature,
|
|
687
965
|
grammar: this.grammar
|
|
688
966
|
});
|
|
689
967
|
let text = "";
|
|
690
|
-
for await (const token of
|
|
968
|
+
for await (const token of result) {
|
|
691
969
|
text += token;
|
|
692
970
|
}
|
|
693
971
|
return text;
|
|
694
972
|
}
|
|
695
|
-
// -------------------------------------------------------------------------
|
|
696
|
-
// Prompt builders
|
|
697
|
-
// -------------------------------------------------------------------------
|
|
698
973
|
_buildCompilePrompt(domHTML, a11yTree, intent, params) {
|
|
699
974
|
const paramsJson = params ? JSON.stringify(params, null, 2) : "{}";
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
975
|
+
const a11y = a11yTree || "No accessibility tree available.";
|
|
976
|
+
return `|<|system|>
|
|
977
|
+
You are an expert web-scraping analyst. Your job is to read a simplified DOM and accessibility tree, then output a structured JSON object describing the page layout, interactive elements, and data extraction plan.
|
|
978
|
+
|
|
979
|
+
Output MUST be valid JSON matching this schema:
|
|
980
|
+
- workflow.nodes: map of node_id -> {element_type, semantic_role, intent, tag, selector_candidates[], parent_id?, children_ids?, aria_label?, aria_role?, text_content?, confidence, reasoning?}
|
|
981
|
+
- workflow.edges: list of {from_node, to_node, action, condition?, probability}
|
|
982
|
+
- workflow.entry_points: list of starting node_ids
|
|
983
|
+
- workflow.exit_points: list of terminal node_ids
|
|
984
|
+
- workflow.domain: the domain name
|
|
985
|
+
- workflow.page_type: one of landing|search|listing|detail|form|checkout|dashboard|unknown
|
|
986
|
+
- intents: list of {intent, action, parameters[], context}
|
|
987
|
+
- selectors: map of selector_name -> list of {type, value, priority}
|
|
988
|
+
- fields: list of {name, selector, transform?, required}
|
|
989
|
+
- container?: string (optional container selector name)
|
|
990
|
+
|
|
991
|
+
Element types: button, link, input, form, navigation, search, filter, sort, pagination, container, list, item, heading, text, image, unknown.
|
|
992
|
+
Selector types: css, semantic, text, attribute.
|
|
993
|
+
|<|user|>
|
|
994
|
+
Intent: ${intent}
|
|
995
|
+
Parameters: ${paramsJson}
|
|
996
|
+
|
|
997
|
+
Simplified DOM:
|
|
998
|
+
${this._truncate(domHTML, 4e4)}
|
|
999
|
+
|
|
1000
|
+
Accessibility Tree:
|
|
1001
|
+
${this._truncate(a11y, 8e3)}
|
|
1002
|
+
|
|
1003
|
+
Compile AXIR:
|
|
1004
|
+
|<|assistant|>
|
|
1005
|
+
`;
|
|
731
1006
|
}
|
|
732
1007
|
_buildHealPrompt(domHTML, brokenSelector, semanticIntent) {
|
|
733
|
-
return
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
].join("\n");
|
|
1008
|
+
return `|<|system|>
|
|
1009
|
+
You are a CSS selector repair tool. Given a broken selector and the current DOM, output the new CSS selector that targets the same semantic element.
|
|
1010
|
+
|
|
1011
|
+
Output JSON: {"selector": "...", "confidence": 0.0-1.0, "reasoning": "..."}
|
|
1012
|
+
|<|user|>
|
|
1013
|
+
Broken selector: ${brokenSelector}
|
|
1014
|
+
Semantic intent: ${semanticIntent}
|
|
1015
|
+
|
|
1016
|
+
Current DOM (truncated):
|
|
1017
|
+
${this._truncate(domHTML, 2e4)}
|
|
1018
|
+
|
|
1019
|
+
New selector:
|
|
1020
|
+
|<|assistant|>
|
|
1021
|
+
`;
|
|
748
1022
|
}
|
|
749
1023
|
_truncate(text, maxChars) {
|
|
750
1024
|
if (text.length <= maxChars) return text;
|
|
@@ -760,7 +1034,7 @@ var LocalAXIRCompiler = class {
|
|
|
760
1034
|
// Annotate the CommonJS export names for ESM import in node:
|
|
761
1035
|
0 && (module.exports = {
|
|
762
1036
|
AWIClient,
|
|
763
|
-
|
|
1037
|
+
AXIRCompiler,
|
|
764
1038
|
AdvisoryExecutor,
|
|
765
1039
|
LocalAXIRCompiler
|
|
766
1040
|
});
|