@awi-protocol/sdk 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +37 -9
- package/dist/index.d.ts +37 -9
- package/dist/index.js +364 -89
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +363 -95
- package/dist/index.mjs.map +1 -1
- package/package.json +29 -50
- package/src/compiler/grammar/axir-schema.gbnf +97 -0
package/dist/index.js
CHANGED
|
@@ -31,10 +31,9 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
AWIClient: () => AWIClient,
|
|
34
|
-
|
|
34
|
+
AXIRCompiler: () => AXIRCompiler,
|
|
35
35
|
AdvisoryExecutor: () => AdvisoryExecutor,
|
|
36
|
-
LocalAXIRCompiler: () => LocalAXIRCompiler
|
|
37
|
-
default: () => client_default
|
|
36
|
+
LocalAXIRCompiler: () => LocalAXIRCompiler
|
|
38
37
|
});
|
|
39
38
|
module.exports = __toCommonJS(index_exports);
|
|
40
39
|
|
|
@@ -242,7 +241,6 @@ var AWIClient = class {
|
|
|
242
241
|
throw lastError || new AWIError("MAX_RETRIES", "Max retries exceeded", 502);
|
|
243
242
|
}
|
|
244
243
|
};
|
|
245
|
-
var client_default = AWIClient;
|
|
246
244
|
|
|
247
245
|
// src/advisory-executor.ts
|
|
248
246
|
var AdvisoryExecutor = class {
|
|
@@ -473,27 +471,312 @@ var AdvisoryExecutor = class {
|
|
|
473
471
|
}
|
|
474
472
|
};
|
|
475
473
|
|
|
474
|
+
// src/compiler/axir-compiler.ts
|
|
475
|
+
var cheerio = __toESM(require("cheerio"));
|
|
476
|
+
/**
 * Deterministic (model-free) AXIR compiler.
 *
 * Parses an HTML document with cheerio, strips non-content nodes, detects
 * coarse page regions (search, navigation, listing, form, pagination, detail),
 * picks the region that best fits the requested intent and emits a workflow
 * graph, selector candidates and field descriptors for that region.
 */
var AXIRCompiler = class {
  $;
  intent;
  params;
  domain;

  /**
   * @param {string} html - Markup handed to `cheerio.load`.
   * @param {{intent?: string, params?: object, domain?: string}} [options]
   *   `params` defaults to `{}` and `domain` to `"unknown"`.
   */
  constructor(html, options = {}) {
    this.$ = cheerio.load(html);
    this.intent = options.intent;
    this.params = options.params || {};
    this.domain = options.domain || "unknown";
  }

  /**
   * Runs the full pipeline and returns the compiled result.
   * @returns {object} `{ workflow, intents, selectors, fields, container,
   *   model_used, tokens_used, compilation_time_ms }`.
   */
  compile() {
    const start = Date.now();
    this.simplifyDOM();
    const regions = this.identifyRegions();
    const target = this.routeIntent(regions);
    return {
      workflow: this.buildWorkflow(target, regions),
      intents: this.mapIntents(),
      selectors: this.generateSelectors(target),
      fields: this.generateFields(target),
      container: target.container,
      model_used: "axir-deterministic-v1",
      tokens_used: 0,
      compilation_time_ms: Date.now() - start
    };
  }

  /** Lower-cased intent used as table key; tolerates a missing or non-string intent. */
  intentKey() {
    return String(this.intent ?? "").toLowerCase();
  }

  /**
   * Own-property lookup. A bare `table[key]` would resolve inherited members
   * such as `constructor` or `toString` when the key comes from caller input.
   */
  lookup(table, key) {
    return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : void 0;
  }

  /** Returns `base`, or `base_2`, `base_3`, ... when `base` is already a key of `map`. */
  uniqueKey(map, base) {
    if (!Object.prototype.hasOwnProperty.call(map, base)) return base;
    let n = 2;
    while (Object.prototype.hasOwnProperty.call(map, `${base}_${n}`)) n += 1;
    return `${base}_${n}`;
  }

  /** Removes non-content elements, childless blank div/span nodes and hidden elements. */
  simplifyDOM() {
    this.$("script, style, svg, noscript, iframe, canvas, video, audio").remove();
    for (const el of this.$("div, span").toArray()) {
      const $el = this.$(el);
      if ($el.children().length === 0 && $el.text().trim() === "") $el.remove();
    }
    this.$('[style*="display:none"], [style*="display: none"], [hidden], [aria-hidden="true"]').remove();
  }

  /**
   * Collects candidate regions in a fixed order: search, navigation, listing,
   * form, pagination, detail.
   * @returns {Array<{type: string, element: object, confidence: number}>}
   */
  identifyRegions() {
    const regions = [];
    for (const el of this.$('form, [role="search"], input[type="search"]').toArray()) {
      const r = this.analyzeSearchRegion(this.$(el));
      if (r) regions.push(r);
    }
    for (const el of this.$('nav, [role="navigation"], header, .nav, .navbar, .menu').toArray()) {
      regions.push({ type: "navigation", element: this.$(el), confidence: 0.9 });
    }
    for (const el of this.$('ul, ol, [role="list"], .list, .results, .items, table, [role="grid"]').toArray()) {
      const $el = this.$(el);
      if ($el.find('li, tr, .item, [role="listitem"]').length > 1) {
        regions.push({ type: "listing", element: $el, confidence: 0.85 });
      }
    }
    // NOTE(review): any <form> with a text input or a button was already
    // recorded as "search" above, so only the remaining forms become "form"
    // regions here. Intents that route only to "form" (e.g. login) therefore
    // fall back to the largest region on typical pages - confirm intended.
    for (const el of this.$("form").toArray()) {
      const $el = this.$(el);
      if (!regions.some((r) => r.element.is($el))) regions.push({ type: "form", element: $el, confidence: 0.9 });
    }
    for (const el of this.$(".pagination, .pager, .pages").toArray()) {
      if (this.isPagination(this.$(el))) regions.push({ type: "pagination", element: this.$(el), confidence: 0.8 });
    }
    for (const el of this.$('article, [role="article"], .content, .main, main, .detail').toArray()) {
      regions.push({ type: "detail", element: this.$(el), confidence: 0.75 });
    }
    return regions;
  }

  /**
   * Classifies a candidate element as a search region.
   *
   * A standalone `input[type="search"]` is itself the search control; looking
   * only at its descendants can never match, so it is handled explicitly and
   * reported when it is not nested in a form or `[role="search"]` container
   * (those containers are candidates in their own right).
   * @returns {{type: string, element: object, confidence: number}|null}
   */
  analyzeSearchRegion($el) {
    if ($el.is('input[type="search"]')) {
      if ($el.closest('form, [role="search"]').length > 0) return null;
      return { type: "search", element: $el, confidence: 0.7 };
    }
    const hasInput = $el.find('input[type="text"], input[type="search"], input:not([type])').length > 0;
    const hasButton = $el.find('button, input[type="submit"]').length > 0;
    if (hasInput || hasButton) return { type: "search", element: $el, confidence: hasInput && hasButton ? 0.95 : 0.7 };
    return null;
  }

  /** True when the element text contains a digit and a next/previous marker. */
  isPagination($el) {
    const text = $el.text().toLowerCase();
    return /\d+/.test(text) && (/next|>|\u203a|\u2192|\u00bb/.test(text) || /prev|previous|<|\u2039|\u2190|\u00ab/.test(text));
  }

  /**
   * Picks the region that best matches the intent. Each intent lists region
   * types by preference; a region scores `(listLength - position) * confidence`.
   * Falls back to the region with the most descendants.
   * @returns {{region: object, container: string}}
   */
  routeIntent(regions) {
    const intentMap = {
      search: ["search", "form"],
      search_jobs: ["search", "listing", "form"],
      extract_list: ["listing", "search", "detail"],
      extract_detail: ["detail", "listing"],
      fill_form: ["form", "search"],
      navigate: ["navigation", "listing"],
      login: ["form"],
      filter: ["search", "listing"],
      sort: ["listing", "search"],
      scrape: ["listing", "detail", "search"]
    };
    const targetTypes = this.lookup(intentMap, this.intentKey()) || ["search", "listing", "form"];
    let best = null;
    let bestScore = 0;
    for (const r of regions) {
      const match = targetTypes.indexOf(r.type);
      const score = match >= 0 ? (targetTypes.length - match) * r.confidence : 0;
      if (score > bestScore) {
        bestScore = score;
        best = r;
      }
    }
    if (!best) best = this.findLargestRegion(regions);
    return { region: best, container: this.generateContainerSelector(best.element) };
  }

  /** Region with the most descendant elements, or a synthetic `body` region when none exist. */
  findLargestRegion(regions) {
    if (regions.length === 0) return { type: "unknown", element: this.$("body"), confidence: 0.5 };
    return regions.reduce((l, c) => c.element.find("*").length > l.element.find("*").length ? c : l);
  }

  /**
   * Builds the workflow graph: one node per region (id `<type>_<index>`),
   * edges inferred pairwise, navigation regions as entry points and
   * listing/detail regions as exit points.
   */
  buildWorkflow(_target, all) {
    const nodes = {};
    const edges = [];
    const entry = [];
    const exit = [];
    all.forEach((r, i) => {
      const id = `${r.type}_${i}`;
      const raw = r.element[0];
      nodes[id] = {
        node_id: id,
        element_type: this.mapType(r.type),
        semantic_role: r.type,
        intent: this.intent,
        tag: raw?.tagName?.toLowerCase(),
        selector_candidates: this.buildCandidates(r.element),
        confidence: r.confidence
      };
      if (r.type === "navigation") entry.push(id);
      if (r.type === "listing" || r.type === "detail") exit.push(id);
    });
    all.forEach((f, fi) => all.forEach((t, ti) => {
      if (fi !== ti) {
        const e = this.inferEdge(f, t, fi, ti);
        if (e) edges.push(e);
      }
    }));
    // Without a navigation region, the first node acts as the entry point.
    if (entry.length === 0 && Object.keys(nodes).length > 0) entry.push(Object.keys(nodes)[0]);
    return { nodes, edges, entry_points: entry, exit_points: exit, domain: this.domain, page_type: this.inferPageType(all) };
  }

  /** Maps a region type to its element type; anything unrecognised becomes "unknown". */
  mapType(t) {
    const m = {
      search: "search",
      navigation: "navigation",
      listing: "list",
      form: "form",
      pagination: "pagination",
      detail: "container"
    };
    return this.lookup(m, t) || "unknown";
  }

  /** Returns the edge implied by an ordered pair of regions, or null when there is none. */
  inferEdge(f, t, fi, ti) {
    if (f.type === "search" && t.type === "listing") return { from_node: `search_${fi}`, to_node: `listing_${ti}`, action: "submit_search", probability: 0.9 };
    if (f.type === "navigation" && t.type === "search") return { from_node: `navigation_${fi}`, to_node: `search_${ti}`, action: "navigate_to_search", probability: 0.7 };
    if (f.type === "listing" && t.type === "pagination") return { from_node: `listing_${fi}`, to_node: `pagination_${ti}`, action: "next_page", probability: 0.8 };
    if (f.type === "pagination" && t.type === "listing") return { from_node: `pagination_${fi}`, to_node: `listing_${ti}`, action: "load_results", probability: 0.95 };
    return null;
  }

  /** Derives the page type from the set of region types present. */
  inferPageType(regions) {
    const t = regions.map((r) => r.type);
    if (t.includes("search") && t.includes("listing")) return "search";
    if (t.includes("listing")) return "listing";
    if (t.includes("form")) return "form";
    if (t.includes("search")) return "search";
    if (t.includes("navigation")) return "landing";
    return "unknown";
  }

  /**
   * Selector candidates for the target container and for the fields, buttons
   * and short links inside it.
   *
   * Keys are made unique with a numeric suffix (`link_more`, `link_more_2`),
   * so repeated labels no longer overwrite each other and a field that
   * happens to be named "container" cannot replace the container entry.
   * @returns {Object<string, Array<object>>}
   */
  generateSelectors(target) {
    const s = {};
    const $el = target.region.element;
    s.container = this.buildCandidates($el);
    for (const el of $el.find("input, textarea, select").toArray()) {
      const n = this.inferFieldName(this.$(el));
      if (n) s[this.uniqueKey(s, n)] = this.buildCandidates(this.$(el));
    }
    for (const el of $el.find('button, input[type="submit"], input[type="button"]').toArray()) {
      const $btn = this.$(el);
      const label = $btn.text().trim() || String($btn.val() || "button");
      s[this.uniqueKey(s, `btn_${this.slugify(label)}`)] = this.buildCandidates($btn);
    }
    for (const el of $el.find("a").toArray()) {
      const $a = this.$(el);
      const t = $a.text().trim();
      if (t && t.length < 50) s[this.uniqueKey(s, `link_${this.slugify(t)}`)] = this.buildCandidates($a);
    }
    return s;
  }

  /**
   * Ordered selector candidates for one element: id, classes, name, type,
   * placeholder, role, aria-label, then short text content. All attribute
   * values are escaped before being embedded in a selector.
   * @returns {Array<{type: string, value: string, priority: number, confidence: number}>}
   */
  buildCandidates($el) {
    const c = [];
    const el = $el[0];
    if (!el) return c;
    const id = $el.attr("id");
    if (id && !id.match(/^\d/)) c.push({ type: "css", value: `#${this.escape(id)}`, priority: 1, confidence: 0.99 });
    // Framework/behaviour hook classes are skipped.
    const classes = ($el.attr("class") || "").split(/\s+/).filter((x) => x && !x.match(/^js-|^ng-|^vue-|^data-/));
    if (classes.length) c.push({ type: "css", value: `.${classes.map((x) => this.escape(x)).join(".")}`, priority: 2, confidence: 0.85 });
    const tag = el.tagName?.toLowerCase() || "";
    const name = $el.attr("name");
    const type = $el.attr("type");
    const placeholder = $el.attr("placeholder");
    if (name) c.push({ type: "css", value: `${tag}[name="${this.q(name)}"]`, priority: 3, confidence: 0.9 });
    if (type) c.push({ type: "css", value: `${tag}[type="${this.q(type)}"]`, priority: 4, confidence: 0.8 });
    if (placeholder) c.push({ type: "css", value: `${tag}[placeholder="${this.q(placeholder)}"]`, priority: 5, confidence: 0.75 });
    const role = $el.attr("role");
    if (role) c.push({ type: "semantic", value: `[role="${this.q(role)}"]`, priority: 6, confidence: 0.9 });
    const al = $el.attr("aria-label");
    if (al) c.push({ type: "semantic", value: `[aria-label="${this.q(al)}"]`, priority: 7, confidence: 0.85 });
    const text = $el.text().trim();
    if (text && text.length < 100) c.push({ type: "text", value: text, priority: 8, confidence: 0.7 });
    return c;
  }

  /** Field descriptors (name, selector, transform, required) for inputs in the target region. */
  generateFields(target) {
    const f = [];
    for (const el of target.region.element.find("input, textarea, select").toArray()) {
      const $el = this.$(el);
      const name = this.inferFieldName($el);
      if (!name) continue;
      f.push({
        name,
        selector: this.bestSelector($el),
        transform: this.inferTransform($el),
        required: $el.attr("required") !== void 0
      });
    }
    return f;
  }

  /**
   * Derives a slug for a form control from, in order: its `<label for>` text,
   * placeholder, aria-label, then name. A source whose slug is empty (blank
   * or symbol-only text) is skipped so the next source can still name it.
   * @returns {string|null}
   */
  inferFieldName($el) {
    const id = $el.attr("id");
    if (id) {
      // The id is escaped: a quote in it would otherwise make the selector invalid.
      const $l = this.$(`label[for="${this.q(id)}"]`);
      if ($l.length) {
        const fromLabel = this.slugify($l.text());
        if (fromLabel) return fromLabel;
      }
    }
    for (const attr of ["placeholder", "aria-label", "name"]) {
      const value = $el.attr(attr);
      if (!value) continue;
      const slug = this.slugify(value);
      if (slug) return slug;
    }
    return null;
  }

  /** Value transform implied by the control type, or undefined when none applies. */
  inferTransform($el) {
    const t = $el.attr("type");
    if (t === "number") return "number";
    if (t === "email") return "email";
    if (t === "date") return "date";
    if (t === "checkbox") return "boolean";
    if ($el.is("select")) return "select";
    return void 0;
  }

  /** Highest-priority candidate value, else the tag name, else `*`. */
  bestSelector($el) {
    const c = this.buildCandidates($el);
    if (c.length) return c[0].value;
    const raw = $el[0];
    return raw?.tagName?.toLowerCase() || "*";
  }

  /** Highest-priority candidate value for a container, else `body`. */
  generateContainerSelector($el) {
    const c = this.buildCandidates($el);
    return c.length ? c[0].value : "body";
  }

  /**
   * Describes the requested intent. Known intents map to a fixed descriptor;
   * anything else yields a generic "interact" descriptor.
   * @returns {Array<{intent: string, action: string, parameters: string[], context: string}>}
   */
  mapIntents() {
    const m = {
      search: { intent: "search", action: "fill_and_submit", parameters: ["query", "location", "filters"], context: "Enter search terms and submit form" },
      search_jobs: { intent: "search_jobs", action: "fill_and_submit", parameters: ["query", "location", "experience_level", "job_type"], context: "Search for job listings with optional filters" },
      extract_list: { intent: "extract_list", action: "extract_fields", parameters: ["items", "title", "url", "metadata"], context: "Extract structured data from list items" },
      extract_detail: { intent: "extract_detail", action: "extract_fields", parameters: ["title", "description", "metadata", "links"], context: "Extract structured data from detail page" },
      fill_form: { intent: "fill_form", action: "fill_and_submit", parameters: Object.keys(this.params), context: "Fill form fields with provided parameters" },
      navigate: { intent: "navigate", action: "click", parameters: ["target_url", "link_text"], context: "Click navigation link to target page" },
      login: { intent: "login", action: "fill_and_submit", parameters: ["username", "password"], context: "Enter credentials and submit login form" },
      scrape: { intent: "scrape", action: "extract_fields", parameters: ["all_visible_text", "links", "images", "structured_data"], context: "Extract all visible content from the page" }
    };
    const mapped = this.lookup(m, this.intentKey());
    if (mapped) return [mapped];
    return [{ intent: this.intent, action: "interact", parameters: Object.keys(this.params), context: `Perform ${this.intent} on the page` }];
  }

  /** Lower-case, underscore-separated identifier of at most 50 characters. */
  slugify(t) {
    return String(t ?? "").toLowerCase().replace(/[^\w\s-]/g, "").replace(/[\s_-]+/g, "_").replace(/^_|_$/g, "").substring(0, 50);
  }

  /**
   * Escapes a string for use as a CSS identifier (id or class name): every
   * character outside `[A-Za-z0-9_-]` and non-ASCII is backslash-escaped, and
   * a leading digit is written as a code-point escape (`2xl` -> `\32 xl`).
   */
  escape(s) {
    return String(s)
      .replace(/[^\w\u00A0-\uFFFF-]/g, (ch) => `\\${ch}`)
      .replace(/^(-?)(\d)/, (_m, dash, digit) => `${dash}\\3${digit} `);
  }

  /** Escapes backslashes and double quotes for a double-quoted attribute selector value. */
  q(s) {
    return String(s).replace(/(["\\])/g, "\\$1");
  }
};
|
|
752
|
+
|
|
476
753
|
// src/compiler/local-axir.ts
|
|
477
754
|
var fs = __toESM(require("fs"));
|
|
478
755
|
var path = __toESM(require("path"));
|
|
479
756
|
var os = __toESM(require("os"));
|
|
480
757
|
var https = __toESM(require("https"));
|
|
481
758
|
var http = __toESM(require("http"));
|
|
482
|
-
|
|
483
|
-
// src/compiler/grammar/axir-schema.gbnf
|
|
484
|
-
var axir_schema_default = 'root ::= "{" ws axir-fields ws "}"\n\naxir-fields ::=\n "\\"workflow\\"" ":" workflow ws ","\n ws "\\"intents\\"" ":" intents ws ","\n ws "\\"selectors\\"" ":" selectors ws ","\n ws "\\"fields\\"" ":" fields\n ["," ws "\\"container\\"" ":" string]\n ["," ws "\\"model_used\\"" ":" string]\n ["," ws "\\"tokens_used\\"" ":" number]\n ["," ws "\\"compilation_time_ms\\"" ":" number]\n\nworkflow ::=\n "{" ws\n "\\"nodes\\"" ":" "{" ws node-list ws "}" ws ","\n ws "\\"edges\\"" ":" "[" ws edge-list ws "]" ws ","\n ws "\\"entry_points\\"" ":" "[" ws string-list ws "]" ws ","\n ws "\\"exit_points\\"" ":" "[" ws string-list ws "]" ws ","\n ws "\\"domain\\"" ":" string ws ","\n ws "\\"page_type\\"" ":" page-type\n ["," ws "\\"structure_hash\\"" ":" string]\n ws "}"\n\nnode-list ::= [node-pair ("," ws node-pair)*]\nnode-pair ::= string ":" "{" ws\n "\\"element_type\\"" ":" element-type ws ","\n ws "\\"semantic_role\\"" ":" string ws ","\n ws "\\"intent\\"" ":" string ws ","\n ws "\\"tag\\"" ":" string ws ","\n ws "\\"selector_candidates\\"" ":" "[" ws selector-list ws "]"\n ["," ws "\\"parent_id\\"" ":" string]\n ["," ws "\\"children_ids\\"" ":" "[" ws string-list ws "]"]\n ["," ws "\\"aria_label\\"" ":" string]\n ["," ws "\\"aria_role\\"" ":" string]\n ["," ws "\\"text_content\\"" ":" string]\n "," ws "\\"confidence\\"" ":" number\n ["," ws "\\"reasoning\\"" ":" string]\nws "}"\n\nelement-type ::=\n "\\"button\\"" | "\\"link\\"" | "\\"input\\"" | "\\"form\\"" |\n "\\"navigation\\"" | "\\"search\\"" | "\\"filter\\"" | "\\"sort\\"" |\n "\\"pagination\\"" | "\\"container\\"" | "\\"list\\"" | "\\"item\\"" |\n "\\"heading\\"" | "\\"text\\"" | "\\"image\\"" | "\\"unknown\\""\n\npage-type ::=\n "\\"landing\\"" | "\\"search\\"" | "\\"listing\\"" | "\\"detail\\"" |\n "\\"form\\"" | "\\"checkout\\"" | "\\"dashboard\\"" | "\\"unknown\\""\n\nedge-list ::= [edge ("," ws edge)*]\nedge ::= "{" ws\n "\\"from_node\\"" ":" string ws ","\n ws "\\"to_node\\"" ":" 
string ws ","\n ws "\\"action\\"" ":" string\n ["," ws "\\"condition\\"" ":" string]\n ["," ws "\\"probability\\"" ":" number]\nws "}"\n\nselector-list ::= [selector ("," ws selector)*]\nselector ::= "{" ws\n "\\"type\\"" ":" selector-type ws ","\n ws "\\"value\\"" ":" string ws ","\n ws "\\"priority\\"" ":" number\n ["," ws "\\"confidence\\"" ":" number]\nws "}"\n\nselector-type ::= "\\"css\\"" | "\\"semantic\\"" | "\\"text\\"" | "\\"attribute\\""\n\nintents ::= "[" ws [intent ("," ws intent)*] ws "]"\nintent ::= "{" ws\n "\\"intent\\"" ":" string ws ","\n ws "\\"action\\"" ":" string ws ","\n ws "\\"parameters\\"" ":" "[" ws [string ("," ws string)*] ws "]" ws ","\n ws "\\"context\\"" ":" string\nws "}"\n\nfields ::= "[" ws [field ("," ws field)*] ws "]"\nfield ::= "{" ws\n "\\"name\\"" ":" string ws ","\n ws "\\"selector\\"" ":" string\n ["," ws "\\"transform\\"" ":" string]\n ["," ws "\\"required\\"" ":" boolean]\nws "}"\n\nselectors ::= "{" ws [selector-pair ("," ws selector-pair)*] ws "}"\nselector-pair ::= string ":" "[" ws selector-list ws "]"\n\nstring-list ::= [string ("," ws string)*]\n\nstring ::= "\\"" char* "\\""\nchar ::= [^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})\n\nnumber ::= ["-"]? ("0" | [1-9] [0-9]*) ([.] [0-9]+)? ([eE] ["-"]? [0-9]+)?\n\nboolean ::= "true" | "false"\n\nws ::= [ \\t\\n\\r]*\n';
|
|
485
|
-
|
|
486
|
-
// src/compiler/local-axir.ts
|
|
759
|
+
var import_url = require("url");
|
|
487
760
|
// Bindings filled in once the optional native dependency has been imported.
// They stay undefined (and nativeAvailable stays false) when the import fails.
var nativeAvailable = false;
var getLlama;
var LlamaModel;
var LlamaContext;
var LlamaGrammar;

// Starts loading node-llama-cpp at module evaluation time. The promise never
// rejects: it resolves to the imported module, or to null when loading failed.
var llamaPromise = (async () => {
  try {
    const mod = await import("node-llama-cpp");
    nativeAvailable = true;
    getLlama = mod.getLlama;
    LlamaModel = mod.LlamaModel;
    LlamaContext = mod.LlamaContext;
    LlamaGrammar = mod.LlamaGrammar;
    return mod;
  } catch (err) {
    nativeAvailable = false;
    // The failure is only reported when AWI_DEBUG is set in the environment.
    if (process.env.AWI_DEBUG) {
      console.error("[AWI] node-llama-cpp load error:", err.message);
    }
    return null;
  }
})();
|
|
779
|
+
var DEFAULT_MODEL_URL = "https://github.com/RayAKaan/AWI/releases/download/v0.0.0-models/Phi-3-mini-128k-instruct-Q4_K_M.gguf";
|
|
497
780
|
var DEFAULT_MODEL_FILENAME = "phi3-128k-q4.gguf";
|
|
498
781
|
var LocalAXIRCompiler = class {
|
|
499
782
|
modelPath;
|
|
@@ -520,9 +803,6 @@ var LocalAXIRCompiler = class {
|
|
|
520
803
|
this.onDownloadProgress = options.onDownloadProgress;
|
|
521
804
|
this.onStatus = options.onStatus;
|
|
522
805
|
}
|
|
523
|
-
// -------------------------------------------------------------------------
|
|
524
|
-
// Public API
|
|
525
|
-
// -------------------------------------------------------------------------
|
|
526
806
|
async compile(domHTML, a11yTree, intent, params) {
|
|
527
807
|
await this._ensureModel();
|
|
528
808
|
await this._ensureGrammar();
|
|
@@ -569,22 +849,26 @@ var LocalAXIRCompiler = class {
|
|
|
569
849
|
this.ready = false;
|
|
570
850
|
}
|
|
571
851
|
}
|
|
572
|
-
// -------------------------------------------------------------------------
|
|
573
|
-
// Model lifecycle
|
|
574
|
-
// -------------------------------------------------------------------------
|
|
575
852
|
async _ensureModel() {
|
|
576
853
|
if (this.ready) return;
|
|
854
|
+
await llamaPromise;
|
|
855
|
+
if (!getLlama) {
|
|
856
|
+
throw new Error("node-llama-cpp failed to load. Is it installed?");
|
|
857
|
+
}
|
|
577
858
|
if (!fs.existsSync(this.modelPath)) {
|
|
578
859
|
await this._downloadModel();
|
|
579
860
|
}
|
|
580
861
|
this._status("Loading local model...");
|
|
581
862
|
const llama = await getLlama();
|
|
582
863
|
const gpuLayers = this.gpuLayers ?? this._autoDetectGPULayers();
|
|
583
|
-
this.model = new
|
|
864
|
+
this.model = new LlamaModel({
|
|
865
|
+
llama,
|
|
584
866
|
modelPath: this.modelPath,
|
|
585
867
|
gpuLayers
|
|
586
868
|
});
|
|
587
|
-
this.context =
|
|
869
|
+
this.context = new LlamaContext({
|
|
870
|
+
llama,
|
|
871
|
+
model: this.model,
|
|
588
872
|
contextSize: this.contextSize
|
|
589
873
|
});
|
|
590
874
|
this.ready = true;
|
|
@@ -592,9 +876,10 @@ var LocalAXIRCompiler = class {
|
|
|
592
876
|
}
|
|
593
877
|
async _ensureGrammar() {
|
|
594
878
|
if (this.grammar) return;
|
|
595
|
-
const
|
|
596
|
-
this.grammar = new
|
|
597
|
-
|
|
879
|
+
const grammarPath = path.join(__dirname, "grammar", "axir-schema.gbnf");
|
|
880
|
+
this.grammar = new LlamaGrammar({
|
|
881
|
+
llama: await getLlama(),
|
|
882
|
+
grammar: fs.readFileSync(grammarPath, "utf-8")
|
|
598
883
|
});
|
|
599
884
|
}
|
|
600
885
|
_autoDetectGPULayers() {
|
|
@@ -603,17 +888,14 @@ var LocalAXIRCompiler = class {
|
|
|
603
888
|
}
|
|
604
889
|
return 0;
|
|
605
890
|
}
|
|
606
|
-
// -------------------------------------------------------------------------
|
|
607
|
-
// Resumable model download
|
|
608
|
-
// -------------------------------------------------------------------------
|
|
609
891
|
async _downloadModel() {
|
|
610
892
|
const dir = path.dirname(this.modelPath);
|
|
611
893
|
if (!fs.existsSync(dir)) {
|
|
612
894
|
fs.mkdirSync(dir, { recursive: true });
|
|
613
895
|
}
|
|
614
896
|
const tempPath = `${this.modelPath}.tmp`;
|
|
615
|
-
const
|
|
616
|
-
const protocol =
|
|
897
|
+
const url = new import_url.URL(this.modelUrl);
|
|
898
|
+
const protocol = url.protocol === "https:" ? https : http;
|
|
617
899
|
let startByte = 0;
|
|
618
900
|
if (fs.existsSync(tempPath)) {
|
|
619
901
|
startByte = fs.statSync(tempPath).size;
|
|
@@ -629,10 +911,10 @@ var LocalAXIRCompiler = class {
|
|
|
629
911
|
headers["Range"] = `bytes=${startByte}-`;
|
|
630
912
|
}
|
|
631
913
|
const request = protocol.get(
|
|
632
|
-
|
|
914
|
+
url,
|
|
633
915
|
{ headers },
|
|
634
916
|
(response) => {
|
|
635
|
-
if (response.statusCode ===
|
|
917
|
+
if (response.statusCode === 302 || response.statusCode === 301) {
|
|
636
918
|
if (response.headers.location) {
|
|
637
919
|
this.modelUrl = response.headers.location;
|
|
638
920
|
return this._downloadModel().then(resolve).catch(reject);
|
|
@@ -640,7 +922,7 @@ var LocalAXIRCompiler = class {
|
|
|
640
922
|
}
|
|
641
923
|
if (response.statusCode !== 200 && response.statusCode !== 206) {
|
|
642
924
|
return reject(
|
|
643
|
-
new Error(`
|
|
925
|
+
new Error(`Download failed: HTTP ${response.statusCode}`)
|
|
644
926
|
);
|
|
645
927
|
}
|
|
646
928
|
const total = parseInt(
|
|
@@ -674,76 +956,69 @@ var LocalAXIRCompiler = class {
|
|
|
674
956
|
});
|
|
675
957
|
});
|
|
676
958
|
}
|
|
677
|
-
// -------------------------------------------------------------------------
|
|
678
|
-
// Inference
|
|
679
|
-
// -------------------------------------------------------------------------
|
|
680
959
|
async _complete(prompt, maxTokens, temperature) {
|
|
681
960
|
if (!this.context) throw new Error("Model not loaded");
|
|
682
961
|
const sequence = this.context.getSequence();
|
|
683
962
|
await sequence.evaluate(prompt);
|
|
684
|
-
const
|
|
963
|
+
const result = await sequence.generateResponse(maxTokens, {
|
|
685
964
|
temperature,
|
|
686
965
|
grammar: this.grammar
|
|
687
966
|
});
|
|
688
967
|
let text = "";
|
|
689
|
-
for await (const token of
|
|
968
|
+
for await (const token of result) {
|
|
690
969
|
text += token;
|
|
691
970
|
}
|
|
692
971
|
return text;
|
|
693
972
|
}
|
|
694
|
-
// -------------------------------------------------------------------------
|
|
695
|
-
// Prompt builders
|
|
696
|
-
// -------------------------------------------------------------------------
|
|
697
973
|
_buildCompilePrompt(domHTML, a11yTree, intent, params) {
|
|
698
974
|
const paramsJson = params ? JSON.stringify(params, null, 2) : "{}";
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
975
|
+
const a11y = a11yTree || "No accessibility tree available.";
|
|
976
|
+
return `|<|system|>
|
|
977
|
+
You are an expert web-scraping analyst. Your job is to read a simplified DOM and accessibility tree, then output a structured JSON object describing the page layout, interactive elements, and data extraction plan.
|
|
978
|
+
|
|
979
|
+
Output MUST be valid JSON matching this schema:
|
|
980
|
+
- workflow.nodes: map of node_id -> {element_type, semantic_role, intent, tag, selector_candidates[], parent_id?, children_ids?, aria_label?, aria_role?, text_content?, confidence, reasoning?}
|
|
981
|
+
- workflow.edges: list of {from_node, to_node, action, condition?, probability}
|
|
982
|
+
- workflow.entry_points: list of starting node_ids
|
|
983
|
+
- workflow.exit_points: list of terminal node_ids
|
|
984
|
+
- workflow.domain: the domain name
|
|
985
|
+
- workflow.page_type: one of landing|search|listing|detail|form|checkout|dashboard|unknown
|
|
986
|
+
- intents: list of {intent, action, parameters[], context}
|
|
987
|
+
- selectors: map of selector_name -> list of {type, value, priority}
|
|
988
|
+
- fields: list of {name, selector, transform?, required}
|
|
989
|
+
- container?: string (optional container selector name)
|
|
990
|
+
|
|
991
|
+
Element types: button, link, input, form, navigation, search, filter, sort, pagination, container, list, item, heading, text, image, unknown.
|
|
992
|
+
Selector types: css, semantic, text, attribute.
|
|
993
|
+
|<|user|>
|
|
994
|
+
Intent: ${intent}
|
|
995
|
+
Parameters: ${paramsJson}
|
|
996
|
+
|
|
997
|
+
Simplified DOM:
|
|
998
|
+
${this._truncate(domHTML, 4e4)}
|
|
999
|
+
|
|
1000
|
+
Accessibility Tree:
|
|
1001
|
+
${this._truncate(a11y, 8e3)}
|
|
1002
|
+
|
|
1003
|
+
Compile AXIR:
|
|
1004
|
+
|<|assistant|>
|
|
1005
|
+
`;
|
|
730
1006
|
}
|
|
731
1007
|
_buildHealPrompt(domHTML, brokenSelector, semanticIntent) {
|
|
732
|
-
return
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
].join("\n");
|
|
1008
|
+
return `|<|system|>
|
|
1009
|
+
You are a CSS selector repair tool. Given a broken selector and the current DOM, output the new CSS selector that targets the same semantic element.
|
|
1010
|
+
|
|
1011
|
+
Output JSON: {"selector": "...", "confidence": 0.0-1.0, "reasoning": "..."}
|
|
1012
|
+
|<|user|>
|
|
1013
|
+
Broken selector: ${brokenSelector}
|
|
1014
|
+
Semantic intent: ${semanticIntent}
|
|
1015
|
+
|
|
1016
|
+
Current DOM (truncated):
|
|
1017
|
+
${this._truncate(domHTML, 2e4)}
|
|
1018
|
+
|
|
1019
|
+
New selector:
|
|
1020
|
+
|<|assistant|>
|
|
1021
|
+
`;
|
|
747
1022
|
}
|
|
748
1023
|
_truncate(text, maxChars) {
|
|
749
1024
|
if (text.length <= maxChars) return text;
|
|
@@ -759,7 +1034,7 @@ var LocalAXIRCompiler = class {
|
|
|
759
1034
|
// Annotate the CommonJS export names for ESM import in node:
|
|
760
1035
|
0 && (module.exports = {
|
|
761
1036
|
AWIClient,
|
|
762
|
-
|
|
1037
|
+
AXIRCompiler,
|
|
763
1038
|
AdvisoryExecutor,
|
|
764
1039
|
LocalAXIRCompiler
|
|
765
1040
|
});
|