@intuned/browser-dev 0.1.16-dev.0 → 0.1.17-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/accumulate_llm_cost.py +120 -0
- package/dist/optimized-extractors/common/aiModelsValidations.js +2 -21
- package/dist/optimized-extractors/common/extractStructuredDataUsingClaude.js +8 -15
- package/dist/optimized-extractors/common/extractStructuredDataUsingGoogle.js +1 -4
- package/dist/optimized-extractors/common/extractStructuredDataUsingOpenAi.js +12 -17
- package/dist/optimized-extractors/common/extractStrucutredDataUsingAiInstance.js +7 -16
- package/dist/optimized-extractors/common/findTableHeaders.js +2 -2
- package/dist/optimized-extractors/common/index.js +12 -2
- package/dist/optimized-extractors/common/isTableHeaderOrFooter.js +2 -2
- package/dist/optimized-extractors/common/modelStringSupport.test.js +1 -1
- package/dist/optimized-extractors/common/utils.js +6 -0
- package/dist/optimized-extractors/listExtractionHelpers/runAiExtraction.js +1 -1
- package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js +2 -2
- package/dist/optimized-extractors/validators.js +1 -1
- package/package.json +1 -1
- package/dist/optimized-extractors/types/aiModelsValidation.js +0 -45
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Accumulate total LLM cost (in cents) across all runs in a results JSONL file.
|
|
4
|
+
|
|
5
|
+
For each line in the results JSONL:
|
|
6
|
+
- read the `log_url` field
|
|
7
|
+
- download that logs JSONL
|
|
8
|
+
- find every log line whose message contains "Total LLM Cost In Cents: <number>"
|
|
9
|
+
- sum all those numbers
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python accumulate_llm_cost.py /path/to/results.jsonl
|
|
13
|
+
python accumulate_llm_cost.py /path/to/results.jsonl --workers 16
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import re
|
|
19
|
+
import sys
|
|
20
|
+
import urllib.request
|
|
21
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
22
|
+
|
|
23
|
+
# Matches "Total LLM Cost In Cents: <number>" (case-insensitive) and captures
# the number, e.g. "01.1", ".5", "12" or "1.5e-07".
# The optional exponent matters: a value written out in exponent form would
# otherwise be captured only up to the "e" ("1.5e-07" -> "1.5"), which would
# grossly overstate the cost once converted with float().
COST_RE = re.compile(
    r"Total LLM Cost In Cents:\s*([0-9]*\.?[0-9]+(?:e[-+]?[0-9]+)?)",
    re.IGNORECASE,
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def download_text(url: str, timeout: int = 120) -> str:
    """Fetch *url* and return the response body as text.

    The request carries a fixed ``User-Agent`` header.  The body is decoded as
    UTF-8 and undecodable bytes are replaced rather than raising, so a
    partially corrupt log can still be scanned.

    Args:
        url: Location of the resource to download.
        timeout: Timeout handed to ``urllib.request.urlopen``.

    Returns:
        The decoded response body.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "llm-cost-accumulator"},
    )
    response = urllib.request.urlopen(request, timeout=timeout)
    with response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def costs_from_log_text(text: str):
    """Yield every cost value found in a logs JSONL blob.

    Each non-blank line is parsed as JSON.  When the line is a JSON object,
    only its ``message`` field is scanned.  When the line is not valid JSON,
    or is valid JSON but not an object (a bare string, number, list or
    ``null``), the raw line is scanned instead.

    Args:
        text: Full contents of a logs file, one entry per line.

    Yields:
        Each number captured by ``COST_RE``, converted to ``float``.
    """
    for raw_line in text.splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        try:
            parsed = json.loads(raw_line)
        except json.JSONDecodeError:
            # Not JSON at all: fall back to scanning the raw line.
            message = raw_line
        else:
            if isinstance(parsed, dict):
                message = parsed.get("message")
            else:
                # Non-object JSON has no .get(); calling it would raise an
                # AttributeError, so treat the line like plain text instead.
                message = raw_line
        if not message:
            continue
        # The message may be a non-string JSON value; str() lets the regex
        # scan its textual form.
        for match in COST_RE.finditer(str(message)):
            yield float(match.group(1))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def process_run(record: dict):
    """Return (label, total_cents, hit_count, error) for one results record.

    The label is ``"<apiInfo.name> / <run id>"`` where the run id is
    ``apiInfo.runId`` or, failing that, ``projectJobRun.id``; missing parts
    are shown as ``"?"``.  The logs are downloaded from ``apiInfo.log_url``
    and every cost entry in them is summed.

    Problems are reported through the ``error`` element instead of being
    raised, so one bad record cannot abort the whole accumulation.  This
    covers a record that is not a JSON object, ``apiInfo`` or
    ``projectJobRun`` being ``null`` or otherwise not an object, a missing
    ``log_url``, a failed download, and unparsable logs.

    Args:
        record: One decoded line of the results JSONL file.

    Returns:
        A 4-tuple ``(label, total_cents, hit_count, error)``; ``error`` is
        ``None`` on success, otherwise a short description and the numeric
        fields are zero.
    """
    if not isinstance(record, dict):
        return "? / ?", 0.0, 0, "record is not a JSON object"

    # An explicit null (or any non-object) must behave like a missing field;
    # .get(key, {}) alone only covers the missing case.
    api = record.get("apiInfo")
    if not isinstance(api, dict):
        api = {}
    job_run = record.get("projectJobRun")
    if not isinstance(job_run, dict):
        job_run = {}

    name = api.get("name", "?")
    run_id = api.get("runId") or job_run.get("id", "?")
    label = f"{name} / {run_id}"

    log_url = api.get("log_url")
    if not log_url:
        return label, 0.0, 0, "no log_url"

    try:
        text = download_text(log_url)
    except Exception as e:  # noqa: BLE001
        return label, 0.0, 0, f"download failed: {e}"

    try:
        costs = list(costs_from_log_text(text))
    except Exception as e:  # noqa: BLE001
        return label, 0.0, 0, f"log parsing failed: {e}"
    return label, sum(costs), len(costs), None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def main():
    """Command-line entry point: total the LLM cost of every run in a results file.

    Reads the results JSONL named on the command line (malformed lines are
    skipped with a warning on stderr), processes each record with
    ``process_run`` on a thread pool of ``--workers`` threads, and prints a
    per-run line (unless ``--quiet``) followed by a summary.

    A run whose processing fails is counted under "Errors" and excluded from
    the total.  That includes a worker raising unexpectedly: the failure is
    recorded for that run instead of aborting the whole accumulation.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("results", help="Path to the results JSONL file")
    parser.add_argument("--workers", type=int, default=8, help="Parallel downloads")
    parser.add_argument("--quiet", action="store_true", help="Only print the grand total")
    args = parser.parse_args()
    if args.workers < 1:
        # ThreadPoolExecutor rejects max_workers <= 0 with a bare ValueError;
        # report it as a normal usage error instead of a traceback.
        parser.error("--workers must be a positive integer")

    records = []
    with open(args.results, "r", encoding="utf-8") as f:
        for ln, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f" [warn] skipping malformed line {ln}: {e}", file=sys.stderr)

    if not args.quiet:
        print(f"Loaded {len(records)} run(s) from {args.results}\n")

    grand_total = 0.0
    total_hits = 0
    errors = 0

    with ThreadPoolExecutor(max_workers=args.workers) as pool:
        # Map each future to the 1-based position of its record so a run can
        # still be identified if its worker raised before building a label.
        futures = {
            pool.submit(process_run, rec): position
            for position, rec in enumerate(records, 1)
        }
        for fut in as_completed(futures):
            try:
                label, total, hits, err = fut.result()
            except Exception as e:  # noqa: BLE001
                label = f"run #{futures[fut]}"
                total, hits, err = 0.0, 0, f"unexpected failure: {e}"
            if err:
                errors += 1
                if not args.quiet:
                    print(f" [error] {label}: {err}")
                continue
            grand_total += total
            total_hits += hits
            if not args.quiet:
                print(f" {label}: {total:.4f} cents ({hits} entr{'y' if hits == 1 else 'ies'})")

    print("\n" + "=" * 60)
    print(f"Runs processed : {len(records)}")
    print(f"Cost entries : {total_hits}")
    print(f"Errors : {errors}")
    print(f"TOTAL LLM COST : {grand_total:.4f} cents (${grand_total / 100:.4f})")
    print("=" * 60)


if __name__ == "__main__":
    main()
|
|
@@ -3,30 +3,11 @@
|
|
|
3
3
|
Object.defineProperty(exports, "__esModule", {
|
|
4
4
|
value: true
|
|
5
5
|
});
|
|
6
|
-
exports.
|
|
7
|
-
const CLAUDE_MODELS = exports.CLAUDE_MODELS = ["claude-opus-4-20250514", "claude-sonnet-4-20250514", "claude-3-7-sonnet-20250219", "claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-5-haiku-20241022", "claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307"];
|
|
6
|
+
exports.MAX_TOKENS_OVERRIDES = void 0;
|
|
8
7
|
const MAX_TOKENS_OVERRIDES = exports.MAX_TOKENS_OVERRIDES = {
|
|
9
8
|
"claude-3-5-sonnet-20240620": 8192,
|
|
10
9
|
"gemini-1.5-pro-002": 8192,
|
|
11
10
|
"gemini-1.5-flash-8b-002": 8192,
|
|
12
11
|
"gemini-1.5-flash-002": 8192,
|
|
13
12
|
"gemini-2.0-flash-exp": 8192
|
|
14
|
-
};
|
|
15
|
-
const CLAUDE_VISION_SUPPORTED_MODELS = exports.CLAUDE_VISION_SUPPORTED_MODELS = [...CLAUDE_MODELS];
|
|
16
|
-
const CLAUDE_MODELS_MAPPINGS = exports.CLAUDE_MODELS_MAPPINGS = {
|
|
17
|
-
"claude-3-haiku": "claude-3-haiku-20240307",
|
|
18
|
-
"claude-3-5-haiku": "claude-3-5-haiku-20241022",
|
|
19
|
-
"claude-3-opus": "claude-3-opus-20240229",
|
|
20
|
-
"claude-3-sonnet": "claude-3-sonnet-20240229",
|
|
21
|
-
"claude-3.5-sonnet": "claude-3-5-sonnet-20241022",
|
|
22
|
-
"claude-4-sonnet": "claude-sonnet-4-20250514",
|
|
23
|
-
"claude-4-opus": "claude-opus-4-20250514"
|
|
24
|
-
};
|
|
25
|
-
const GPT_MODELS = exports.GPT_MODELS = ["gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-4o", "gpt-4o-mini", "gpt-4o-audio-preview", "gpt-4-turbo", "gpt-4", "gpt-3.5-turbo", "o1", "o1-mini", "o1-preview", "o3-mini", "o3", "o4-mini", "chatgpt-4o-latest", "gpt4-turbo"];
|
|
26
|
-
const GOOGLE_MODELS = exports.GOOGLE_MODELS = ["gemini-2.5-pro", "gemini-2.5-flash", "gemini-2.5-flash-lite", "gemini-2.5-flash-lite-preview-06-17", "gemini-2.0-flash", "gemini-1.5-pro", "gemini-1.5-pro-latest", "gemini-1.5-flash", "gemini-1.5-flash-latest", "gemini-1.5-flash-8b", "gemini-1.5-flash-8b-latest"];
|
|
27
|
-
const MODELS_MAPPINGS = exports.MODELS_MAPPINGS = {
|
|
28
|
-
...CLAUDE_MODELS_MAPPINGS
|
|
29
|
-
};
|
|
30
|
-
const SUPPPORTED_CLAUDE_MODELS = exports.SUPPPORTED_CLAUDE_MODELS = ["claude-3-5-haiku-20241022", "claude-3-5-haiku-latest", "claude-3-5-sonnet-20240620", "claude-3-5-sonnet-20241022", "claude-3-5-sonnet-latest", "claude-3-7-sonnet-20250219", "claude-3-7-sonnet-latest", "claude-3-haiku-20240307", "claude-4-opus-20250514", "claude-4-sonnet-20250514", "claude-opus-4-1", "claude-opus-4-1-20250805", "claude-opus-4-20250514", "claude-sonnet-4-20250514"];
|
|
31
|
-
const SUPPPORTED_GPT_MODELS = exports.SUPPPORTED_GPT_MODELS = ["gpt-3.5-turbo", "gpt-3.5-turbo-0125", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-instruct", "gpt-3.5-turbo-instruct-0914", "gpt-4", "gpt-4-0314", "gpt-4-0613", "gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613", "gpt-4-turbo", "gpt-4-turbo-2024-04-09", "gpt-4.1", "gpt-4.1-2025-04-14", "gpt-4.1-mini", "gpt-4.1-mini-2025-04-14", "gpt-4.1-nano", "gpt-4.1-nano-2025-04-14", "gpt-4o", "gpt-4o-2024-05-13", "gpt-4o-2024-08-06", "gpt-4o-2024-11-20", "gpt-4o-mini", "gpt-4o-mini-2024-07-18", "gpt-5", "gpt-5-2025-08-07", "gpt-5-chat", "gpt-5-chat-latest", "gpt-5-mini", "gpt-5-mini-2025-08-07", "gpt-5-nano", "gpt-5-nano-2025-08-07", "o1", "o1-2024-12-17", "o1-mini", "o1-mini-2024-09-12", "o1-pro", "o1-pro-2025-03-19", "o3", "o3-2025-04-16", "o3-deep-research", "o3-deep-research-2025-06-26", "o3-mini", "o3-mini-2025-01-31", "o3-pro", "o3-pro-2025-06-10", "o4-mini", "o4-mini-2025-04-16", "o4-mini-deep-research", "o4-mini-deep-research-2025-06-26"];
|
|
32
|
-
const SUPPORTED_MODELS = exports.SUPPORTED_MODELS = [...SUPPPORTED_CLAUDE_MODELS, ...SUPPPORTED_GPT_MODELS];
|
|
13
|
+
};
|
|
@@ -8,10 +8,10 @@ var _anthropicModel = require("../models/anthropicModel");
|
|
|
8
8
|
var _neverthrow = require("neverthrow");
|
|
9
9
|
var Errors = _interopRequireWildcard(require("../types/errors"));
|
|
10
10
|
var _utils = require("./utils");
|
|
11
|
-
var _Logger = require("../../common/Logger");
|
|
12
11
|
var _aiModelsValidations = require("../common/aiModelsValidations");
|
|
13
12
|
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
14
13
|
async function extractStructuredDataUsingClaude(input) {
|
|
14
|
+
var _unwrappedResponse$us, _unwrappedResponse$us2;
|
|
15
15
|
const {
|
|
16
16
|
entityName,
|
|
17
17
|
model,
|
|
@@ -56,8 +56,7 @@ async function extractStructuredDataUsingClaude(input) {
|
|
|
56
56
|
const anthropic = (0, _anthropicModel.createAnthropicInstance)({
|
|
57
57
|
apiKey
|
|
58
58
|
});
|
|
59
|
-
const
|
|
60
|
-
const maxTokens = _aiModelsValidations.MAX_TOKENS_OVERRIDES[modelName] ?? 4096;
|
|
59
|
+
const maxTokens = _aiModelsValidations.MAX_TOKENS_OVERRIDES[model] ?? 4096;
|
|
61
60
|
const response = await (0, _neverthrow.fromPromise)(anthropic.messages.create({
|
|
62
61
|
max_tokens: maxTokens,
|
|
63
62
|
temperature: 0,
|
|
@@ -66,7 +65,7 @@ async function extractStructuredDataUsingClaude(input) {
|
|
|
66
65
|
role: "user",
|
|
67
66
|
content
|
|
68
67
|
}],
|
|
69
|
-
model
|
|
68
|
+
model,
|
|
70
69
|
tools: [{
|
|
71
70
|
input_schema: processedJsonSchema,
|
|
72
71
|
name: toolName,
|
|
@@ -119,17 +118,11 @@ async function extractStructuredDataUsingClaude(input) {
|
|
|
119
118
|
}
|
|
120
119
|
const result = (0, _utils.getResultFromOutputSchema)(originalJsonSchema, entityName, tool.input);
|
|
121
120
|
const callCost = response.value.response.headers.get("x-ai-cost-in-cents");
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
_Logger.logger.info(`extractor ${input.identifier}: AI cost is not calculated (using custom API key)`);
|
|
125
|
-
} else if (callCost) {
|
|
126
|
-
const cost = parseFloat(callCost);
|
|
127
|
-
if (!isNaN(cost)) {
|
|
128
|
-
_Logger.logger.info(`extractor ${input.identifier}: AI cost is $${cost / 100}`);
|
|
129
|
-
}
|
|
130
|
-
}
|
|
131
|
-
}
|
|
121
|
+
const costInCents = (0, _utils.parseCostInCents)(callCost);
|
|
122
|
+
const totalTokens = (((_unwrappedResponse$us = unwrappedResponse.usage) === null || _unwrappedResponse$us === void 0 ? void 0 : _unwrappedResponse$us.input_tokens) ?? 0) + (((_unwrappedResponse$us2 = unwrappedResponse.usage) === null || _unwrappedResponse$us2 === void 0 ? void 0 : _unwrappedResponse$us2.output_tokens) ?? 0);
|
|
132
123
|
return (0, _neverthrow.ok)({
|
|
133
|
-
result
|
|
124
|
+
result,
|
|
125
|
+
costInCents,
|
|
126
|
+
totalTokens
|
|
134
127
|
});
|
|
135
128
|
}
|
|
@@ -15,10 +15,7 @@ async function extractStructuredDataUsingGoogle(input) {
|
|
|
15
15
|
if (!input.apiKey) {
|
|
16
16
|
return (0, _neverthrow.err)(Errors.invalidInput("Google AI is only supported with a custom API key. Please provide it or use a different AI provider."));
|
|
17
17
|
}
|
|
18
|
-
|
|
19
|
-
if (input.model in _aiModelsValidations.MODELS_MAPPINGS) {
|
|
20
|
-
model = _aiModelsValidations.MODELS_MAPPINGS[input.model];
|
|
21
|
-
}
|
|
18
|
+
const model = input.model;
|
|
22
19
|
const googleGenAi = (0, _google.createGoogleGenerativeAI)({
|
|
23
20
|
apiKey: input.apiKey
|
|
24
21
|
});
|
|
@@ -7,12 +7,10 @@ exports.extractStructuredDataUsingOpenAi = extractStructuredDataUsingOpenAi;
|
|
|
7
7
|
var _neverthrow = require("neverthrow");
|
|
8
8
|
var Errors = _interopRequireWildcard(require("../types/errors"));
|
|
9
9
|
var _utils = require("./utils");
|
|
10
|
-
var _Logger = require("../../common/Logger");
|
|
11
|
-
var _aiModelsValidations = require("../common/aiModelsValidations");
|
|
12
10
|
var _openaiModel = require("../models/openaiModel");
|
|
13
11
|
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
14
12
|
async function extractStructuredDataUsingOpenAi(input) {
|
|
15
|
-
var _completion$value$dat, _completion$value$dat2;
|
|
13
|
+
var _completion$value$dat, _completion$value$dat2, _completion$value$dat3;
|
|
16
14
|
const {
|
|
17
15
|
entityName,
|
|
18
16
|
model,
|
|
@@ -50,14 +48,17 @@ async function extractStructuredDataUsingOpenAi(input) {
|
|
|
50
48
|
}));
|
|
51
49
|
content.push(...imageContent);
|
|
52
50
|
}
|
|
53
|
-
const modelName =
|
|
51
|
+
const modelName = input.model;
|
|
52
|
+
const supportsCustomTemperature = !/^(o\d|gpt-5)/i.test(modelName);
|
|
54
53
|
const toolName = `extract_${entityName}`;
|
|
55
54
|
const openAiInstance = (0, _openaiModel.createOpenAIInstance)({
|
|
56
55
|
apiKey
|
|
57
56
|
});
|
|
58
57
|
const completion = await (0, _neverthrow.fromPromise)(openAiInstance.chat.completions.create({
|
|
59
|
-
|
|
60
|
-
|
|
58
|
+
max_completion_tokens: 4000,
|
|
59
|
+
...(supportsCustomTemperature ? {
|
|
60
|
+
temperature: 0
|
|
61
|
+
} : {}),
|
|
61
62
|
model: modelName,
|
|
62
63
|
messages: [{
|
|
63
64
|
role: "system",
|
|
@@ -116,17 +117,11 @@ async function extractStructuredDataUsingOpenAi(input) {
|
|
|
116
117
|
const result = (0, _utils.getResultFromOutputSchema)(originalJsonSchema, entityName, parsedData.value);
|
|
117
118
|
const formatted = (0, _utils.cleanupAiResult)(result);
|
|
118
119
|
const callCost = completion.value.response.headers.get("x-ai-cost-in-cents");
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
_Logger.logger.info(`extractor ${input.identifier}: AI cost is not calculated (using custom API key)`);
|
|
122
|
-
} else if (callCost) {
|
|
123
|
-
const cost = parseFloat(callCost);
|
|
124
|
-
if (!isNaN(cost)) {
|
|
125
|
-
_Logger.logger.info(`extractor ${input.identifier}: AI cost is $${cost / 100}`);
|
|
126
|
-
}
|
|
127
|
-
}
|
|
128
|
-
}
|
|
120
|
+
const costInCents = (0, _utils.parseCostInCents)(callCost);
|
|
121
|
+
const totalTokens = (_completion$value$dat3 = completion.value.data.usage) === null || _completion$value$dat3 === void 0 ? void 0 : _completion$value$dat3.total_tokens;
|
|
129
122
|
return (0, _neverthrow.ok)({
|
|
130
|
-
result: formatted
|
|
123
|
+
result: formatted,
|
|
124
|
+
costInCents,
|
|
125
|
+
totalTokens
|
|
131
126
|
});
|
|
132
127
|
}
|
|
@@ -7,11 +7,10 @@ exports.extractStructuredDataUsingAiInstance = extractStructuredDataUsingAiInsta
|
|
|
7
7
|
var _neverthrow = require("neverthrow");
|
|
8
8
|
var Errors = _interopRequireWildcard(require("../types/errors"));
|
|
9
9
|
var _utils = require("./utils");
|
|
10
|
-
var _Logger = require("../../common/Logger");
|
|
11
10
|
var _ai = require("ai");
|
|
12
11
|
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
13
12
|
async function extractStructuredDataUsingAiInstance(input) {
|
|
14
|
-
var _apiResult$value$tool, _apiResult$value$tool2, _apiResult$value$resp;
|
|
13
|
+
var _apiResult$value$tool, _apiResult$value$tool2, _apiResult$value$resp, _apiResult$value$usag;
|
|
15
14
|
const {
|
|
16
15
|
entityName,
|
|
17
16
|
model,
|
|
@@ -20,9 +19,7 @@ async function extractStructuredDataUsingAiInstance(input) {
|
|
|
20
19
|
text,
|
|
21
20
|
extraUserMessages,
|
|
22
21
|
images,
|
|
23
|
-
|
|
24
|
-
apiName,
|
|
25
|
-
maxTokens
|
|
22
|
+
apiName
|
|
26
23
|
} = input;
|
|
27
24
|
const processedJsonSchema = (0, _utils.processInputSchema)(originalJsonSchema, entityName);
|
|
28
25
|
const content = [];
|
|
@@ -106,17 +103,11 @@ async function extractStructuredDataUsingAiInstance(input) {
|
|
|
106
103
|
const result = (0, _utils.getResultFromOutputSchema)(originalJsonSchema, entityName, extractedData);
|
|
107
104
|
const formatted = (0, _utils.cleanupAiResult)(result);
|
|
108
105
|
const callCost = (_apiResult$value$resp = apiResult.value.response.headers) === null || _apiResult$value$resp === void 0 ? void 0 : _apiResult$value$resp["x-ai-cost-in-cents"];
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
_Logger.logger.info(`extractor ${input.identifier}: AI cost is not calculated (using custom API key)`);
|
|
112
|
-
} else if (callCost) {
|
|
113
|
-
const cost = parseFloat(callCost);
|
|
114
|
-
if (!isNaN(cost)) {
|
|
115
|
-
_Logger.logger.info(`extractor ${input.identifier}: AI cost is $${cost / 100}`);
|
|
116
|
-
}
|
|
117
|
-
}
|
|
118
|
-
}
|
|
106
|
+
const costInCents = (0, _utils.parseCostInCents)(callCost);
|
|
107
|
+
const totalTokens = (_apiResult$value$usag = apiResult.value.usage) === null || _apiResult$value$usag === void 0 ? void 0 : _apiResult$value$usag.totalTokens;
|
|
119
108
|
return (0, _neverthrow.ok)({
|
|
120
|
-
result: formatted
|
|
109
|
+
result: formatted,
|
|
110
|
+
costInCents,
|
|
111
|
+
totalTokens
|
|
121
112
|
});
|
|
122
113
|
}
|
|
@@ -10,7 +10,7 @@ var _imageSize = require("image-size");
|
|
|
10
10
|
var _neverthrow = require("neverthrow");
|
|
11
11
|
var Errors = _interopRequireWildcard(require("../types/errors"));
|
|
12
12
|
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
13
|
-
async function getTableHeadersUsingAi(handle,
|
|
13
|
+
async function getTableHeadersUsingAi(handle, model) {
|
|
14
14
|
var _response$error;
|
|
15
15
|
let image;
|
|
16
16
|
try {
|
|
@@ -57,7 +57,7 @@ async function getTableHeadersUsingAi(handle, identifier) {
|
|
|
57
57
|
}
|
|
58
58
|
}]
|
|
59
59
|
}],
|
|
60
|
-
model
|
|
60
|
+
model,
|
|
61
61
|
tools: [{
|
|
62
62
|
input_schema: {
|
|
63
63
|
type: "object",
|
|
@@ -13,6 +13,7 @@ var _extractStructuredDataUsingOpenAi = require("./extractStructuredDataUsingOpe
|
|
|
13
13
|
var _utils = require("./utils");
|
|
14
14
|
var _extractStructuredDataUsingGoogle = require("./extractStructuredDataUsingGoogle");
|
|
15
15
|
var _getModelProvider = require("../../common/getModelProvider");
|
|
16
|
+
var _Logger = require("../../common/Logger");
|
|
16
17
|
function isClaudeModel(model) {
|
|
17
18
|
return (0, _getModelProvider.getModelProvider)(model) === "anthropic";
|
|
18
19
|
}
|
|
@@ -47,9 +48,18 @@ async function extractStructuredDataUsingAi(input) {
|
|
|
47
48
|
return (0, _neverthrow.err)(extractionResult.error);
|
|
48
49
|
}
|
|
49
50
|
const {
|
|
50
|
-
result
|
|
51
|
+
result,
|
|
52
|
+
costInCents,
|
|
53
|
+
totalTokens
|
|
51
54
|
} = extractionResult.value;
|
|
55
|
+
if (costInCents !== undefined) {
|
|
56
|
+
_Logger.logger.info(`Total LLM Cost In Cents: ${costInCents}`);
|
|
57
|
+
} else if (totalTokens !== undefined) {
|
|
58
|
+
_Logger.logger.info(`Total LLM Tokens: ${totalTokens}`);
|
|
59
|
+
}
|
|
52
60
|
return (0, _neverthrow.ok)({
|
|
53
|
-
result: (0, _utils.cleanupAiResult)(result)
|
|
61
|
+
result: (0, _utils.cleanupAiResult)(result),
|
|
62
|
+
costInCents,
|
|
63
|
+
totalTokens
|
|
54
64
|
});
|
|
55
65
|
}
|
|
@@ -9,7 +9,7 @@ var _zod = require("zod");
|
|
|
9
9
|
var _neverthrow = require("neverthrow");
|
|
10
10
|
var Errors = _interopRequireWildcard(require("../types/errors"));
|
|
11
11
|
function _interopRequireWildcard(e, t) { if ("function" == typeof WeakMap) var r = new WeakMap(), n = new WeakMap(); return (_interopRequireWildcard = function (e, t) { if (!t && e && e.__esModule) return e; var o, i, f = { __proto__: null, default: e }; if (null === e || "object" != typeof e && "function" != typeof e) return f; if (o = t ? n : r) { if (o.has(e)) return o.get(e); o.set(e, f); } for (const t in e) "default" !== t && {}.hasOwnProperty.call(e, t) && ((i = (o = Object.defineProperty) && Object.getOwnPropertyDescriptor(e, t)) && (i.get || i.set) ? o(f, t, i) : f[t] = e[t]); return f; })(e, t); }
|
|
12
|
-
async function isTableHeaderOrFooter(content) {
|
|
12
|
+
async function isTableHeaderOrFooter(content, model) {
|
|
13
13
|
var _response$error;
|
|
14
14
|
if (!content) {
|
|
15
15
|
return (0, _neverthrow.ok)({
|
|
@@ -39,7 +39,7 @@ async function isTableHeaderOrFooter(content) {
|
|
|
39
39
|
a table header is a row that contains labels for table columns, and footer usually has pagination information or summary of the table`
|
|
40
40
|
}, itemContent]
|
|
41
41
|
}],
|
|
42
|
-
model
|
|
42
|
+
model,
|
|
43
43
|
tools: [{
|
|
44
44
|
input_schema: {
|
|
45
45
|
type: "object",
|
|
@@ -6,11 +6,17 @@ Object.defineProperty(exports, "__esModule", {
|
|
|
6
6
|
exports.cleanupAiResult = cleanupAiResult;
|
|
7
7
|
exports.getRandomItems = getRandomItems;
|
|
8
8
|
exports.getResultFromOutputSchema = getResultFromOutputSchema;
|
|
9
|
+
exports.parseCostInCents = parseCostInCents;
|
|
9
10
|
exports.processInputSchema = processInputSchema;
|
|
10
11
|
function getRandomItems(arr, numItems) {
|
|
11
12
|
const shuffled = arr.sort(() => 0.5 - Math.random());
|
|
12
13
|
return shuffled.slice(0, numItems);
|
|
13
14
|
}
|
|
15
|
+
// Converts a cost header value (callers in this package pass the
// "x-ai-cost-in-cents" response header) into a number of cents.
// Returns undefined when the value is missing/empty or does not start with
// a number that parseFloat can read.
function parseCostInCents(headerValue) {
  if (headerValue) {
    const parsed = parseFloat(headerValue);
    if (!isNaN(parsed)) {
      return parsed;
    }
  }
  return undefined;
}
|
|
14
20
|
function processInputSchema(originalJsonSchema, entityName) {
|
|
15
21
|
const internalSchema = structuredClone(originalJsonSchema);
|
|
16
22
|
delete internalSchema.description;
|
|
@@ -209,7 +209,7 @@ async function splitDomAndExtractData({
|
|
|
209
209
|
tableLocater
|
|
210
210
|
} = await (0, _tablesUtils.isListTable)(listItemsContainerLocator, itemsSimplifiedHtml);
|
|
211
211
|
const tableAsJsonArray = isTable ? await (0, _tablesUtils.createJsonFromTable)(pageAndSearchRegion.page) : [];
|
|
212
|
-
const tableHeaders = tableLocater ? await (0, _findTableHeaders.getTableHeadersUsingAi)(tableLocater,
|
|
212
|
+
const tableHeaders = tableLocater ? await (0, _findTableHeaders.getTableHeadersUsingAi)(tableLocater, strategy.model) : undefined;
|
|
213
213
|
if (tableHeaders && tableHeaders.isErr()) {
|
|
214
214
|
return (0, _neverthrow.err)(tableHeaders.error);
|
|
215
215
|
}
|
package/dist/optimized-extractors/listExtractionHelpers/utils/extractPropertiesUsingGPTFromArray.js
CHANGED
|
@@ -44,7 +44,7 @@ async function extractPropertiesUsingGPT({
|
|
|
44
44
|
}
|
|
45
45
|
if (possibleTableHeaderOrFooter) {
|
|
46
46
|
const content = text ?? image;
|
|
47
|
-
const isHeader = await (0, _isTableHeaderOrFooter.isTableHeaderOrFooter)(content);
|
|
47
|
+
const isHeader = await (0, _isTableHeaderOrFooter.isTableHeaderOrFooter)(content, strategy.model);
|
|
48
48
|
if (isHeader.isErr()) {
|
|
49
49
|
return (0, _neverthrow.err)(isHeader.error);
|
|
50
50
|
}
|
|
@@ -146,7 +146,7 @@ async function extractPropertiesWithHTMLStrategy({
|
|
|
146
146
|
apiKey
|
|
147
147
|
}) {
|
|
148
148
|
const shouldUseTableData = !!tableAsJsonArray && tableAsJsonArray.length === items.length;
|
|
149
|
-
const isWeakModel = strategy.model
|
|
149
|
+
const isWeakModel = strategy.model.includes("haiku") || strategy.model.includes("turbo");
|
|
150
150
|
const averageItemLength = items.reduce((sum, item) => {
|
|
151
151
|
if (item.type !== "text") return sum;
|
|
152
152
|
return sum + (0, _extractionHelpers.compressStringSpaces)(item.text).length;
|
|
@@ -113,7 +113,7 @@ const strategySchema = exports.strategySchema = _zod.z.union([htmlStrategySchema
|
|
|
113
113
|
};
|
|
114
114
|
}
|
|
115
115
|
}).optional().default({
|
|
116
|
-
model: "claude-
|
|
116
|
+
model: "claude-haiku-4-5-20251001",
|
|
117
117
|
type: "HTML"
|
|
118
118
|
});
|
|
119
119
|
const labelSchema = _zod.z.string({
|
package/package.json
CHANGED
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
|
|
3
|
-
Object.defineProperty(exports, "__esModule", {
|
|
4
|
-
value: true
|
|
5
|
-
});
|
|
6
|
-
exports.SUPPORTED_VISION_MODELS = exports.SUPPORTED_TEXT_MODELS = exports.SUPPORTED_GPT_MODELS = exports.SUPPORTED_GOOGLE_MODELS = exports.SUPPORTED_CLAUDE_MODELS = exports.MODELS_MAPPINGS = exports.MAX_TOKENS_OVERRIDES = exports.GPT_MODELS_MAPPINGS = exports.GOOGLE_MODELS_MAPPINGS = exports.CLAUDE_VISION_SUPPORTED_MODELS = exports.CLAUDE_ONLY_TEXT_MODELS = exports.CLAUDE_MODELS_MAPPINGS = void 0;
|
|
7
|
-
const CLAUDE_ONLY_TEXT_MODELS = exports.CLAUDE_ONLY_TEXT_MODELS = ["claude-3-5-haiku", "claude-3-5-haiku-20241022"];
|
|
8
|
-
const CLAUDE_VISION_SUPPORTED_MODELS = exports.CLAUDE_VISION_SUPPORTED_MODELS = ["claude-3-haiku", "claude-3-haiku-20240307", "claude-3.5-sonnet", "claude-3-5-sonnet-20240620", "claude-3-5-sonnet-20241022", "claude-opus-4", "claude-opus-4-20250514", "claude-sonnet-4", "claude-sonnet-4-20250514"];
|
|
9
|
-
const SUPPORTED_CLAUDE_MODELS = exports.SUPPORTED_CLAUDE_MODELS = [...CLAUDE_ONLY_TEXT_MODELS, ...CLAUDE_VISION_SUPPORTED_MODELS];
|
|
10
|
-
const CLAUDE_MODELS_MAPPINGS = exports.CLAUDE_MODELS_MAPPINGS = {
|
|
11
|
-
"claude-3-haiku": "claude-3-haiku-20240307",
|
|
12
|
-
"claude-3-5-haiku": "claude-3-5-haiku-20241022",
|
|
13
|
-
"claude-3.5-sonnet": "claude-3-5-sonnet-20241022",
|
|
14
|
-
"claude-opus-4": "claude-opus-4-20250514",
|
|
15
|
-
"claude-sonnet-4": "claude-sonnet-4-20250514"
|
|
16
|
-
};
|
|
17
|
-
const GPT_ONLY_TEXT_GPT_MODELS = ["gpt3.5-turbo", "gpt-3.5-turbo-0125"];
|
|
18
|
-
const GPT_VISION_SUPPORTED_MODELS = ["gpt4-turbo", "gpt-4-turbo-2024-04-09", "gpt-4o", "gpt-4o-2024-05-13", "gpt-4o-mini", "gpt-4o-mini-2024-07-18"];
|
|
19
|
-
const SUPPORTED_GPT_MODELS = exports.SUPPORTED_GPT_MODELS = [...GPT_ONLY_TEXT_GPT_MODELS, ...GPT_VISION_SUPPORTED_MODELS];
|
|
20
|
-
const GPT_MODELS_MAPPINGS = exports.GPT_MODELS_MAPPINGS = {
|
|
21
|
-
"gpt4-turbo": "gpt-4-turbo-2024-04-09",
|
|
22
|
-
"gpt3.5-turbo": "gpt-3.5-turbo-0125",
|
|
23
|
-
"gpt-4o": "gpt-4o-2024-05-13",
|
|
24
|
-
"gpt-4o-mini": "gpt-4o-mini-2024-07-18"
|
|
25
|
-
};
|
|
26
|
-
const SUPPORTED_GOOGLE_MODELS = exports.SUPPORTED_GOOGLE_MODELS = ["gemini-1.5-pro", "gemini-1.5-pro-002", "gemini-1.5-flash-8b", "gemini-1.5-flash-8b-002", "gemini-1.5-flash", "gemini-1.5-flash-002", "gemini-2.0-flash-exp"];
|
|
27
|
-
const GOOGLE_MODELS_MAPPINGS = exports.GOOGLE_MODELS_MAPPINGS = {
|
|
28
|
-
"gemini-1.5-pro": "gemini-1.5-pro-002",
|
|
29
|
-
"gemini-1.5-flash-8b": "gemini-1.5-flash-8b-002",
|
|
30
|
-
"gemini-1.5-flash": "gemini-1.5-flash-002"
|
|
31
|
-
};
|
|
32
|
-
const SUPPORTED_TEXT_MODELS = exports.SUPPORTED_TEXT_MODELS = [...SUPPORTED_CLAUDE_MODELS, ...SUPPORTED_GPT_MODELS, ...SUPPORTED_GOOGLE_MODELS];
|
|
33
|
-
const SUPPORTED_VISION_MODELS = exports.SUPPORTED_VISION_MODELS = [...CLAUDE_VISION_SUPPORTED_MODELS, ...GPT_VISION_SUPPORTED_MODELS, ...SUPPORTED_GOOGLE_MODELS];
|
|
34
|
-
const MODELS_MAPPINGS = exports.MODELS_MAPPINGS = {
|
|
35
|
-
...GPT_MODELS_MAPPINGS,
|
|
36
|
-
...CLAUDE_MODELS_MAPPINGS,
|
|
37
|
-
...GOOGLE_MODELS_MAPPINGS
|
|
38
|
-
};
|
|
39
|
-
const MAX_TOKENS_OVERRIDES = exports.MAX_TOKENS_OVERRIDES = {
|
|
40
|
-
"claude-3-5-sonnet-20240620": 8192,
|
|
41
|
-
"gemini-1.5-pro-002": 8192,
|
|
42
|
-
"gemini-1.5-flash-8b-002": 8192,
|
|
43
|
-
"gemini-1.5-flash-002": 8192,
|
|
44
|
-
"gemini-2.0-flash-exp": 8192
|
|
45
|
-
};
|