@retab/node 0.0.48 → 0.0.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -215
- package/dist/api/client.d.ts +2 -2
- package/dist/api/client.d.ts.map +1 -1
- package/dist/api/client.js +2 -2
- package/dist/api/documents/client.d.ts +3 -3
- package/dist/api/documents/client.d.ts.map +1 -1
- package/dist/api/documents/client.js +3 -3
- package/dist/api/projects/client.d.ts +15 -0
- package/dist/api/projects/client.d.ts.map +1 -0
- package/dist/api/projects/client.js +43 -0
- package/dist/api/projects/documents/client.d.ts +12 -0
- package/dist/api/projects/documents/client.d.ts.map +1 -0
- package/dist/api/projects/documents/client.js +39 -0
- package/dist/api/projects/iterations/client.d.ts +17 -0
- package/dist/api/projects/iterations/client.d.ts.map +1 -0
- package/dist/api/projects/iterations/client.js +64 -0
- package/dist/client.d.ts +1 -0
- package/dist/client.d.ts.map +1 -1
- package/dist/client.js +6 -1
- package/dist/generated_types.d.ts +17837 -40090
- package/dist/generated_types.d.ts.map +1 -1
- package/dist/generated_types.js +309 -979
- package/dist/index.d.ts +7 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -2
- package/dist/types.d.ts +188 -80
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +22 -1
- package/package.json +6 -9
- package/dist/api/consensus/client.d.ts +0 -7
- package/dist/api/consensus/client.d.ts.map +0 -1
- package/dist/api/consensus/client.js +0 -14
- package/dist/errors.d.ts +0 -34
- package/dist/errors.d.ts.map +0 -1
- package/dist/errors.js +0 -53
- package/dist/resource.d.ts +0 -12
- package/dist/resource.d.ts.map +0 -1
- package/dist/resource.js +0 -19
- package/dist/resources/consensus/completions.d.ts +0 -66
- package/dist/resources/consensus/completions.d.ts.map +0 -1
- package/dist/resources/consensus/completions.js +0 -84
- package/dist/resources/consensus/index.d.ts +0 -72
- package/dist/resources/consensus/index.d.ts.map +0 -1
- package/dist/resources/consensus/index.js +0 -76
- package/dist/resources/consensus/responses.d.ts +0 -69
- package/dist/resources/consensus/responses.d.ts.map +0 -1
- package/dist/resources/consensus/responses.js +0 -99
- package/dist/resources/documents/extractions.d.ts +0 -74
- package/dist/resources/documents/extractions.d.ts.map +0 -1
- package/dist/resources/documents/extractions.js +0 -196
- package/dist/resources/documents/index.d.ts +0 -21
- package/dist/resources/documents/index.d.ts.map +0 -1
- package/dist/resources/documents/index.js +0 -55
- package/dist/resources/evaluations/documents.d.ts +0 -40
- package/dist/resources/evaluations/documents.d.ts.map +0 -1
- package/dist/resources/evaluations/documents.js +0 -123
- package/dist/resources/evaluations/index.d.ts +0 -14
- package/dist/resources/evaluations/index.d.ts.map +0 -1
- package/dist/resources/evaluations/index.js +0 -17
- package/dist/resources/evaluations/iterations.d.ts +0 -50
- package/dist/resources/evaluations/iterations.d.ts.map +0 -1
- package/dist/resources/evaluations/iterations.js +0 -156
- package/dist/resources/files.d.ts +0 -82
- package/dist/resources/files.d.ts.map +0 -1
- package/dist/resources/files.js +0 -150
- package/dist/resources/finetuning.d.ts +0 -105
- package/dist/resources/finetuning.d.ts.map +0 -1
- package/dist/resources/finetuning.js +0 -181
- package/dist/resources/index.d.ts +0 -11
- package/dist/resources/index.d.ts.map +0 -1
- package/dist/resources/index.js +0 -10
- package/dist/resources/models.d.ts +0 -57
- package/dist/resources/models.d.ts.map +0 -1
- package/dist/resources/models.js +0 -72
- package/dist/resources/processors/automations/endpoints.d.ts +0 -90
- package/dist/resources/processors/automations/endpoints.d.ts.map +0 -1
- package/dist/resources/processors/automations/endpoints.js +0 -145
- package/dist/resources/processors/automations/index.d.ts +0 -7
- package/dist/resources/processors/automations/index.d.ts.map +0 -1
- package/dist/resources/processors/automations/index.js +0 -6
- package/dist/resources/processors/automations/links.d.ts +0 -90
- package/dist/resources/processors/automations/links.d.ts.map +0 -1
- package/dist/resources/processors/automations/links.js +0 -149
- package/dist/resources/processors/automations/logs.d.ts +0 -35
- package/dist/resources/processors/automations/logs.d.ts.map +0 -1
- package/dist/resources/processors/automations/logs.js +0 -60
- package/dist/resources/processors/automations/mailboxes.d.ts +0 -102
- package/dist/resources/processors/automations/mailboxes.d.ts.map +0 -1
- package/dist/resources/processors/automations/mailboxes.js +0 -157
- package/dist/resources/processors/automations/outlook.d.ts +0 -114
- package/dist/resources/processors/automations/outlook.d.ts.map +0 -1
- package/dist/resources/processors/automations/outlook.js +0 -170
- package/dist/resources/processors/automations/tests.d.ts +0 -58
- package/dist/resources/processors/automations/tests.d.ts.map +0 -1
- package/dist/resources/processors/automations/tests.js +0 -90
- package/dist/resources/processors/index.d.ts +0 -303
- package/dist/resources/processors/index.d.ts.map +0 -1
- package/dist/resources/processors/index.js +0 -261
- package/dist/resources/schemas.d.ts +0 -63
- package/dist/resources/schemas.d.ts.map +0 -1
- package/dist/resources/schemas.js +0 -183
- package/dist/resources/secrets/external_api_keys.d.ts +0 -61
- package/dist/resources/secrets/external_api_keys.d.ts.map +0 -1
- package/dist/resources/secrets/external_api_keys.js +0 -120
- package/dist/resources/secrets/index.d.ts +0 -14
- package/dist/resources/secrets/index.d.ts.map +0 -1
- package/dist/resources/secrets/index.js +0 -17
- package/dist/resources/secrets/webhooks.d.ts +0 -73
- package/dist/resources/secrets/webhooks.d.ts.map +0 -1
- package/dist/resources/secrets/webhooks.js +0 -145
- package/dist/resources/usage.d.ts +0 -223
- package/dist/resources/usage.d.ts.map +0 -1
- package/dist/resources/usage.js +0 -310
- package/dist/types/ai_models.d.ts +0 -389
- package/dist/types/ai_models.d.ts.map +0 -1
- package/dist/types/ai_models.js +0 -145
- package/dist/types/automations/cron.d.ts +0 -28
- package/dist/types/automations/cron.d.ts.map +0 -1
- package/dist/types/automations/cron.js +0 -1
- package/dist/types/automations/endpoints.d.ts +0 -13
- package/dist/types/automations/endpoints.d.ts.map +0 -1
- package/dist/types/automations/endpoints.js +0 -1
- package/dist/types/automations/index.d.ts +0 -7
- package/dist/types/automations/index.d.ts.map +0 -1
- package/dist/types/automations/index.js +0 -6
- package/dist/types/automations/links.d.ts +0 -15
- package/dist/types/automations/links.d.ts.map +0 -1
- package/dist/types/automations/links.js +0 -1
- package/dist/types/automations/mailboxes.d.ts +0 -18
- package/dist/types/automations/mailboxes.d.ts.map +0 -1
- package/dist/types/automations/mailboxes.js +0 -1
- package/dist/types/automations/outlook.d.ts +0 -37
- package/dist/types/automations/outlook.d.ts.map +0 -1
- package/dist/types/automations/outlook.js +0 -1
- package/dist/types/automations/webhooks.d.ts +0 -13
- package/dist/types/automations/webhooks.d.ts.map +0 -1
- package/dist/types/automations/webhooks.js +0 -1
- package/dist/types/browser_canvas.d.ts +0 -4
- package/dist/types/browser_canvas.d.ts.map +0 -1
- package/dist/types/browser_canvas.js +0 -2
- package/dist/types/chat.d.ts +0 -99
- package/dist/types/chat.d.ts.map +0 -1
- package/dist/types/chat.js +0 -20
- package/dist/types/consensus.d.ts +0 -10
- package/dist/types/consensus.d.ts.map +0 -1
- package/dist/types/consensus.js +0 -1
- package/dist/types/db/annotations.d.ts +0 -108
- package/dist/types/db/annotations.d.ts.map +0 -1
- package/dist/types/db/annotations.js +0 -6
- package/dist/types/db/files.d.ts +0 -133
- package/dist/types/db/files.d.ts.map +0 -1
- package/dist/types/db/files.js +0 -5
- package/dist/types/documents/extractions.d.ts +0 -1849
- package/dist/types/documents/extractions.d.ts.map +0 -1
- package/dist/types/documents/extractions.js +0 -211
- package/dist/types/documents/processing.d.ts +0 -249
- package/dist/types/documents/processing.d.ts.map +0 -1
- package/dist/types/documents/processing.js +0 -6
- package/dist/types/evaluations/iterations.d.ts +0 -41
- package/dist/types/evaluations/iterations.d.ts.map +0 -1
- package/dist/types/evaluations/iterations.js +0 -1
- package/dist/types/jobs/base.d.ts +0 -162
- package/dist/types/jobs/base.d.ts.map +0 -1
- package/dist/types/jobs/base.js +0 -6
- package/dist/types/jobs/specialized.d.ts +0 -200
- package/dist/types/jobs/specialized.d.ts.map +0 -1
- package/dist/types/jobs/specialized.js +0 -37
- package/dist/types/logs.d.ts +0 -92
- package/dist/types/logs.d.ts.map +0 -1
- package/dist/types/logs.js +0 -1
- package/dist/types/mime.d.ts +0 -426
- package/dist/types/mime.d.ts.map +0 -1
- package/dist/types/mime.js +0 -48
- package/dist/types/modalities.d.ts +0 -31
- package/dist/types/modalities.d.ts.map +0 -1
- package/dist/types/modalities.js +0 -109
- package/dist/types/pagination.d.ts +0 -5
- package/dist/types/pagination.d.ts.map +0 -1
- package/dist/types/pagination.js +0 -1
- package/dist/types/schemas/enhancement.d.ts +0 -250
- package/dist/types/schemas/enhancement.d.ts.map +0 -1
- package/dist/types/schemas/enhancement.js +0 -6
- package/dist/types/schemas/generate.d.ts +0 -160
- package/dist/types/schemas/generate.d.ts.map +0 -1
- package/dist/types/schemas/generate.js +0 -19
- package/dist/types/schemas/object.d.ts +0 -116
- package/dist/types/schemas/object.d.ts.map +0 -1
- package/dist/types/schemas/object.js +0 -861
- package/dist/types/secrets/external_api_keys.d.ts +0 -27
- package/dist/types/secrets/external_api_keys.d.ts.map +0 -1
- package/dist/types/secrets/external_api_keys.js +0 -11
- package/dist/types/secrets/index.d.ts +0 -2
- package/dist/types/secrets/index.d.ts.map +0 -1
- package/dist/types/secrets/index.js +0 -1
- package/dist/types/standards.d.ts +0 -37
- package/dist/types/standards.d.ts.map +0 -1
- package/dist/types/standards.js +0 -1
- package/dist/utils/ai_models.d.ts +0 -10
- package/dist/utils/ai_models.d.ts.map +0 -1
- package/dist/utils/ai_models.js +0 -183
- package/dist/utils/batch_processing.d.ts +0 -227
- package/dist/utils/batch_processing.d.ts.map +0 -1
- package/dist/utils/batch_processing.js +0 -268
- package/dist/utils/benchmarking.d.ts +0 -115
- package/dist/utils/benchmarking.d.ts.map +0 -1
- package/dist/utils/benchmarking.js +0 -355
- package/dist/utils/chat.d.ts +0 -70
- package/dist/utils/chat.d.ts.map +0 -1
- package/dist/utils/chat.js +0 -79
- package/dist/utils/cost_calculation.d.ts +0 -26
- package/dist/utils/cost_calculation.d.ts.map +0 -1
- package/dist/utils/cost_calculation.js +0 -99
- package/dist/utils/datasets.d.ts +0 -135
- package/dist/utils/datasets.d.ts.map +0 -1
- package/dist/utils/datasets.js +0 -359
- package/dist/utils/display.d.ts +0 -108
- package/dist/utils/display.d.ts.map +0 -1
- package/dist/utils/display.js +0 -244
- package/dist/utils/hash.d.ts +0 -18
- package/dist/utils/hash.d.ts.map +0 -1
- package/dist/utils/hash.js +0 -31
- package/dist/utils/hashing.d.ts +0 -18
- package/dist/utils/hashing.d.ts.map +0 -1
- package/dist/utils/hashing.js +0 -28
- package/dist/utils/index.d.ts +0 -8
- package/dist/utils/index.d.ts.map +0 -1
- package/dist/utils/index.js +0 -10
- package/dist/utils/json_schema.d.ts +0 -18
- package/dist/utils/json_schema.d.ts.map +0 -1
- package/dist/utils/json_schema.js +0 -334
- package/dist/utils/json_schema_utils.d.ts +0 -42
- package/dist/utils/json_schema_utils.d.ts.map +0 -1
- package/dist/utils/json_schema_utils.js +0 -212
- package/dist/utils/jsonl.d.ts +0 -60
- package/dist/utils/jsonl.d.ts.map +0 -1
- package/dist/utils/jsonl.js +0 -259
- package/dist/utils/mime.d.ts +0 -6
- package/dist/utils/mime.d.ts.map +0 -1
- package/dist/utils/mime.js +0 -129
- package/dist/utils/model_cards.d.ts +0 -219
- package/dist/utils/model_cards.d.ts.map +0 -1
- package/dist/utils/model_cards.js +0 -462
- package/dist/utils/prompt_optimization.d.ts +0 -96
- package/dist/utils/prompt_optimization.d.ts.map +0 -1
- package/dist/utils/prompt_optimization.js +0 -275
- package/dist/utils/responses.d.ts +0 -35
- package/dist/utils/responses.d.ts.map +0 -1
- package/dist/utils/responses.js +0 -37
- package/dist/utils/stream.d.ts +0 -13
- package/dist/utils/stream.d.ts.map +0 -1
- package/dist/utils/stream.js +0 -64
- package/dist/utils/stream_context_managers.d.ts +0 -147
- package/dist/utils/stream_context_managers.d.ts.map +0 -1
- package/dist/utils/stream_context_managers.js +0 -380
- package/dist/utils/usage.d.ts +0 -57
- package/dist/utils/usage.d.ts.map +0 -1
- package/dist/utils/usage.js +0 -97
- package/dist/utils/webhook_secrets.d.ts +0 -59
- package/dist/utils/webhook_secrets.d.ts.map +0 -1
- package/dist/utils/webhook_secrets.js +0 -107
- package/dist/utils/zod_to_json_schema.d.ts +0 -11
- package/dist/utils/zod_to_json_schema.d.ts.map +0 -1
- package/dist/utils/zod_to_json_schema.js +0 -123
package/dist/utils/datasets.d.ts
DELETED
|
@@ -1,135 +0,0 @@
|
|
|
1
|
-
import { SyncAPIResource, AsyncAPIResource } from '../resource.js';
|
|
2
|
-
import { DatasetMetrics } from './display.js';
|
|
3
|
-
/**
|
|
4
|
-
* Advanced Dataset management utilities for ML training workflows
|
|
5
|
-
* Equivalent to Python's jsonlUtils.py
|
|
6
|
-
*/
|
|
7
|
-
export interface FinetuningJSON {
|
|
8
|
-
messages: Array<{
|
|
9
|
-
role: 'system' | 'user' | 'assistant';
|
|
10
|
-
content: string;
|
|
11
|
-
}>;
|
|
12
|
-
}
|
|
13
|
-
export interface DocumentAnnotationPair {
|
|
14
|
-
document: string | Buffer;
|
|
15
|
-
annotation: Record<string, any>;
|
|
16
|
-
}
|
|
17
|
-
export interface BatchJSONLRequest {
|
|
18
|
-
custom_id: string;
|
|
19
|
-
method: 'POST';
|
|
20
|
-
url: string;
|
|
21
|
-
body: Record<string, any>;
|
|
22
|
-
}
|
|
23
|
-
export interface BatchJSONLResponse {
|
|
24
|
-
id: string;
|
|
25
|
-
custom_id: string;
|
|
26
|
-
response: {
|
|
27
|
-
status_code: number;
|
|
28
|
-
request_id: string;
|
|
29
|
-
body: Record<string, any>;
|
|
30
|
-
};
|
|
31
|
-
error?: {
|
|
32
|
-
code: string;
|
|
33
|
-
message: string;
|
|
34
|
-
};
|
|
35
|
-
}
|
|
36
|
-
export interface AnnotationOptions {
|
|
37
|
-
model?: string;
|
|
38
|
-
temperature?: number;
|
|
39
|
-
modality?: 'native' | 'text';
|
|
40
|
-
maxConcurrency?: number;
|
|
41
|
-
reasoning_effort?: 'low' | 'medium' | 'high';
|
|
42
|
-
provider?: 'openai' | 'anthropic' | 'xai' | 'gemini';
|
|
43
|
-
idempotencyKey?: string;
|
|
44
|
-
}
|
|
45
|
-
export interface SaveOptions {
|
|
46
|
-
modality?: 'native' | 'text';
|
|
47
|
-
imageResolutionDpi?: number;
|
|
48
|
-
browserCanvas?: 'A3' | 'A4' | 'A5';
|
|
49
|
-
}
|
|
50
|
-
export declare class BaseDatasetsMixin {
|
|
51
|
-
/**
|
|
52
|
-
* Process dataset and compute comprehensive metrics
|
|
53
|
-
*/
|
|
54
|
-
pprint(datasetPath: string, inputTokenPrice?: number, outputTokenPrice?: number): Promise<DatasetMetrics>;
|
|
55
|
-
/**
|
|
56
|
-
* Save document-annotation pairs as JSONL training dataset
|
|
57
|
-
*/
|
|
58
|
-
save(jsonSchema: Record<string, any> | string, documentAnnotationPairsPaths: Array<{
|
|
59
|
-
document: string;
|
|
60
|
-
annotation: string;
|
|
61
|
-
}>, datasetPath: string, options?: SaveOptions): Promise<void>;
|
|
62
|
-
/**
|
|
63
|
-
* Change schema in existing dataset
|
|
64
|
-
*/
|
|
65
|
-
changeSchema(inputDatasetPath: string, jsonSchema: Record<string, any> | string, outputDatasetPath?: string, inplace?: boolean): Promise<void>;
|
|
66
|
-
/**
|
|
67
|
-
* Stitch multiple documents and save as dataset
|
|
68
|
-
*/
|
|
69
|
-
stitchAndSave(jsonSchema: Record<string, any> | string, pairsPaths: Array<{
|
|
70
|
-
documents: string[];
|
|
71
|
-
annotation: string;
|
|
72
|
-
}>, datasetPath: string, modality?: 'native' | 'text'): Promise<void>;
|
|
73
|
-
/**
|
|
74
|
-
* Generate annotations for documents using AI models
|
|
75
|
-
*/
|
|
76
|
-
annotate(jsonSchema: Record<string, any> | string, documents: string[], datasetPath: string, options?: AnnotationOptions): Promise<void>;
|
|
77
|
-
/**
|
|
78
|
-
* Update existing annotations with new model/schema
|
|
79
|
-
*/
|
|
80
|
-
updateAnnotations(jsonSchema: Record<string, any> | string, oldDatasetPath: string, newDatasetPath: string, options?: AnnotationOptions): Promise<void>;
|
|
81
|
-
/**
|
|
82
|
-
* Save batch annotation requests for OpenAI Batch API
|
|
83
|
-
*/
|
|
84
|
-
saveBatchAnnotateRequests(jsonSchema: Record<string, any> | string, documents: string[], batchRequestsPath: string, options?: AnnotationOptions): Promise<void>;
|
|
85
|
-
/**
|
|
86
|
-
* Build dataset from batch API results
|
|
87
|
-
*/
|
|
88
|
-
buildDatasetFromBatchResults(jsonSchema: Record<string, any> | string, batchResultsPath: string, datasetPath: string, modality?: 'native' | 'text'): Promise<void>;
|
|
89
|
-
private createSystemMessage;
|
|
90
|
-
private createUserMessage;
|
|
91
|
-
private createMultiDocumentUserMessage;
|
|
92
|
-
private generateAnnotation;
|
|
93
|
-
private generateAnnotationFromUserMessage;
|
|
94
|
-
}
|
|
95
|
-
export declare class Datasets extends SyncAPIResource {
|
|
96
|
-
private mixin;
|
|
97
|
-
pprint(datasetPath: string, inputTokenPrice?: number, outputTokenPrice?: number): Promise<DatasetMetrics>;
|
|
98
|
-
save(jsonSchema: Record<string, any> | string, documentAnnotationPairsPaths: Array<{
|
|
99
|
-
document: string;
|
|
100
|
-
annotation: string;
|
|
101
|
-
}>, datasetPath: string, options?: SaveOptions): Promise<void>;
|
|
102
|
-
changeSchema(inputDatasetPath: string, jsonSchema: Record<string, any> | string, outputDatasetPath?: string, inplace?: boolean): Promise<void>;
|
|
103
|
-
stitchAndSave(jsonSchema: Record<string, any> | string, pairsPaths: Array<{
|
|
104
|
-
documents: string[];
|
|
105
|
-
annotation: string;
|
|
106
|
-
}>, datasetPath: string, modality?: 'native' | 'text'): Promise<void>;
|
|
107
|
-
annotate(jsonSchema: Record<string, any> | string, documents: string[], datasetPath: string, options?: AnnotationOptions): Promise<void>;
|
|
108
|
-
updateAnnotations(jsonSchema: Record<string, any> | string, oldDatasetPath: string, newDatasetPath: string, options?: AnnotationOptions): Promise<void>;
|
|
109
|
-
saveBatchAnnotateRequests(jsonSchema: Record<string, any> | string, documents: string[], batchRequestsPath: string, options?: AnnotationOptions): Promise<void>;
|
|
110
|
-
buildDatasetFromBatchResults(jsonSchema: Record<string, any> | string, batchResultsPath: string, datasetPath: string, modality?: 'native' | 'text'): Promise<void>;
|
|
111
|
-
}
|
|
112
|
-
export declare class AsyncDatasets extends AsyncAPIResource {
|
|
113
|
-
private mixin;
|
|
114
|
-
pprint(datasetPath: string, inputTokenPrice?: number, outputTokenPrice?: number): Promise<DatasetMetrics>;
|
|
115
|
-
save(jsonSchema: Record<string, any> | string, documentAnnotationPairsPaths: Array<{
|
|
116
|
-
document: string;
|
|
117
|
-
annotation: string;
|
|
118
|
-
}>, datasetPath: string, options?: SaveOptions): Promise<void>;
|
|
119
|
-
changeSchema(inputDatasetPath: string, jsonSchema: Record<string, any> | string, outputDatasetPath?: string, inplace?: boolean): Promise<void>;
|
|
120
|
-
stitchAndSave(jsonSchema: Record<string, any> | string, pairsPaths: Array<{
|
|
121
|
-
documents: string[];
|
|
122
|
-
annotation: string;
|
|
123
|
-
}>, datasetPath: string, modality?: 'native' | 'text'): Promise<void>;
|
|
124
|
-
annotate(jsonSchema: Record<string, any> | string, documents: string[], datasetPath: string, options?: AnnotationOptions): Promise<void>;
|
|
125
|
-
updateAnnotations(jsonSchema: Record<string, any> | string, oldDatasetPath: string, newDatasetPath: string, options?: AnnotationOptions): Promise<void>;
|
|
126
|
-
saveBatchAnnotateRequests(jsonSchema: Record<string, any> | string, documents: string[], batchRequestsPath: string, options?: AnnotationOptions): Promise<void>;
|
|
127
|
-
buildDatasetFromBatchResults(jsonSchema: Record<string, any> | string, batchResultsPath: string, datasetPath: string, modality?: 'native' | 'text'): Promise<void>;
|
|
128
|
-
}
|
|
129
|
-
declare const _default: {
|
|
130
|
-
Datasets: typeof Datasets;
|
|
131
|
-
AsyncDatasets: typeof AsyncDatasets;
|
|
132
|
-
BaseDatasetsMixin: typeof BaseDatasetsMixin;
|
|
133
|
-
};
|
|
134
|
-
export default _default;
|
|
135
|
-
//# sourceMappingURL=datasets.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"datasets.d.ts","sourceRoot":"","sources":["../../src/utils/datasets.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAmD,cAAc,EAAE,MAAM,cAAc,CAAC;AAE/F;;;GAGG;AAEH,MAAM,WAAW,cAAc;IAC7B,QAAQ,EAAE,KAAK,CAAC;QACd,IAAI,EAAE,QAAQ,GAAG,MAAM,GAAG,WAAW,CAAC;QACtC,OAAO,EAAE,MAAM,CAAC;KACjB,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,sBAAsB;IACrC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAAC;IAC1B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CACjC;AAED,MAAM,WAAW,iBAAiB;IAChC,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAC3B;AAED,MAAM,WAAW,kBAAkB;IACjC,EAAE,EAAE,MAAM,CAAC;IACX,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE;QACR,WAAW,EAAE,MAAM,CAAC;QACpB,UAAU,EAAE,MAAM,CAAC;QACnB,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;KAC3B,CAAC;IACF,KAAK,CAAC,EAAE;QACN,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,EAAE,MAAM,CAAC;KACjB,CAAC;CACH;AAED,MAAM,WAAW,iBAAiB;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,CAAC;IAC7B,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,gBAAgB,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;IAC7C,QAAQ,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,KAAK,GAAG,QAAQ,CAAC;IACrD,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,WAAW;IAC1B,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,CAAC;IAC7B,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,aAAa,CAAC,EAAE,IAAI,GAAG,IAAI,GAAG,IAAI,CAAC;CACpC;AAED,qBAAa,iBAAiB;IAC5B;;OAEG;IACG,MAAM,CACV,WAAW,EAAE,MAAM,EACnB,eAAe,GAAE,MAAgB,EACjC,gBAAgB,GAAE,MAAe,GAChC,OAAO,CAAC,cAAc,CAAC;IAe1B;;OAEG;IACG,IAAI,CACR,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,4BAA4B,EAAE,KAAK,CAAC;QAClC,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC,EACF,WAAW,EAAE,MAAM,EACnB,OAAO,GAAE,WAAgB,GACxB,OAAO,CAAC,IAAI,CAAC;IAkChB;;OAEG;IACG,YAAY,CAChB,gBAAgB,EAAE,MAAM,EACxB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,iBAAiB,CAAC,EAAE,MAAM,EAC1B,OAAO,GAAE,OAAe,GACvB,OAAO,CAAC,IAAI,CAAC;IAkChB;;OAEG;IACG,aAAa,CACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAA
M,EACxC,UAAU,EAAE,KAAK,CAAC;QAChB,SAAS,EAAE,MAAM,EAAE,CAAC;QACpB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC,EACF,WAAW,EAAE,MAAM,EACnB,QAAQ,GAAE,QAAQ,GAAG,MAAiB,GACrC,OAAO,CAAC,IAAI,CAAC;IAkChB;;OAEG;IACG,QAAQ,CACZ,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,WAAW,EAAE,MAAM,EACnB,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,IAAI,CAAC;IAyDhB;;OAEG;IACG,iBAAiB,CACrB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,cAAc,EAAE,MAAM,EACtB,cAAc,EAAE,MAAM,EACtB,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,IAAI,CAAC;IAoDhB;;OAEG;IACG,yBAAyB,CAC7B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,iBAAiB,EAAE,MAAM,EACzB,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,IAAI,CAAC;IA+BhB;;OAEG;IACG,4BAA4B,CAChC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,gBAAgB,EAAE,MAAM,EACxB,WAAW,EAAE,MAAM,EACnB,QAAQ,GAAE,QAAQ,GAAG,MAAiB,GACrC,OAAO,CAAC,IAAI,CAAC;IAoDhB,OAAO,CAAC,mBAAmB;YAab,iBAAiB;YAejB,8BAA8B;YAe9B,kBAAkB;YAclB,iCAAiC;CAQhD;AAED,qBAAa,QAAS,SAAQ,eAAe;IAC3C,OAAO,CAAC,KAAK,CAA2B;IAElC,MAAM,CACV,WAAW,EAAE,MAAM,EACnB,eAAe,CAAC,EAAE,MAAM,EACxB,gBAAgB,CAAC,EAAE,MAAM,GACxB,OAAO,CAAC,cAAc,CAAC;IAIpB,IAAI,CACR,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,4BAA4B,EAAE,KAAK,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,CAAC,EAC7E,WAAW,EAAE,MAAM,EACnB,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,IAAI,CAAC;IAIV,YAAY,CAChB,gBAAgB,EAAE,MAAM,EACxB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,iBAAiB,CAAC,EAAE,MAAM,EAC1B,OAAO,CAAC,EAAE,OAAO,GAChB,OAAO,CAAC,IAAI,CAAC;IAIV,aAAa,CACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,UAAU,EAAE,KAAK,CAAC;QAAE,SAAS,EAAE,MAAM,EAAE,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,CAAC,EAC9D,WAAW,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,GAC3B,OAAO,CAAC,IAAI,CAAC;IAIV,QAAQ,CACZ,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,WAAW,EAAE,MAAM,EACnB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,iBAAiB,CACrB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EAC
xC,cAAc,EAAE,MAAM,EACtB,cAAc,EAAE,MAAM,EACtB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,yBAAyB,CAC7B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,iBAAiB,EAAE,MAAM,EACzB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,4BAA4B,CAChC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,gBAAgB,EAAE,MAAM,EACxB,WAAW,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,GAC3B,OAAO,CAAC,IAAI,CAAC;CAGjB;AAED,qBAAa,aAAc,SAAQ,gBAAgB;IACjD,OAAO,CAAC,KAAK,CAA2B;IAElC,MAAM,CACV,WAAW,EAAE,MAAM,EACnB,eAAe,CAAC,EAAE,MAAM,EACxB,gBAAgB,CAAC,EAAE,MAAM,GACxB,OAAO,CAAC,cAAc,CAAC;IAIpB,IAAI,CACR,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,4BAA4B,EAAE,KAAK,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,CAAC,EAC7E,WAAW,EAAE,MAAM,EACnB,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,IAAI,CAAC;IAIV,YAAY,CAChB,gBAAgB,EAAE,MAAM,EACxB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,iBAAiB,CAAC,EAAE,MAAM,EAC1B,OAAO,CAAC,EAAE,OAAO,GAChB,OAAO,CAAC,IAAI,CAAC;IAIV,aAAa,CACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,UAAU,EAAE,KAAK,CAAC;QAAE,SAAS,EAAE,MAAM,EAAE,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,CAAC,EAC9D,WAAW,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,GAC3B,OAAO,CAAC,IAAI,CAAC;IAIV,QAAQ,CACZ,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,WAAW,EAAE,MAAM,EACnB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,iBAAiB,CACrB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,cAAc,EAAE,MAAM,EACtB,cAAc,EAAE,MAAM,EACtB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,yBAAyB,CAC7B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,iBAAiB,EAAE,MAAM,EACzB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,4BAA4B,CAChC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,gBAAgB,EAAE,MAAM,EACxB,WAAW,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,GAC3B,OAAO,CAAC,IAAI,CAAC;CAGjB;;;;;;AAED,wBAIE"}
|
package/dist/utils/datasets.js
DELETED
|
@@ -1,359 +0,0 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
2
|
-
import path from 'path';
|
|
3
|
-
import { SyncAPIResource, AsyncAPIResource } from '../resource.js';
|
|
4
|
-
import { readJSONL, writeJSONL } from './jsonl.js';
|
|
5
|
-
import { displayMetrics, processDatasetAndComputeMetrics } from './display.js';
|
|
6
|
-
export class BaseDatasetsMixin {
|
|
7
|
-
/**
|
|
8
|
-
* Process dataset and compute comprehensive metrics
|
|
9
|
-
*/
|
|
10
|
-
async pprint(datasetPath, inputTokenPrice = 0.00015, outputTokenPrice = 0.0006) {
|
|
11
|
-
if (!fs.existsSync(datasetPath)) {
|
|
12
|
-
throw new Error(`Dataset file not found: ${datasetPath}`);
|
|
13
|
-
}
|
|
14
|
-
const metrics = await processDatasetAndComputeMetrics(datasetPath, inputTokenPrice, outputTokenPrice);
|
|
15
|
-
displayMetrics(metrics);
|
|
16
|
-
return metrics;
|
|
17
|
-
}
|
|
18
|
-
/**
|
|
19
|
-
* Save document-annotation pairs as JSONL training dataset
|
|
20
|
-
*/
|
|
21
|
-
async save(jsonSchema, documentAnnotationPairsPaths, datasetPath, options = {}) {
|
|
22
|
-
const { modality = 'native' } = options;
|
|
23
|
-
const finetuningData = [];
|
|
24
|
-
for (const { document: docPath, annotation: annPath } of documentAnnotationPairsPaths) {
|
|
25
|
-
// Read document and annotation
|
|
26
|
-
if (!fs.existsSync(docPath) || !fs.existsSync(annPath)) {
|
|
27
|
-
throw new Error(`Document or annotation file not found: ${docPath}, ${annPath}`);
|
|
28
|
-
}
|
|
29
|
-
const annotation = JSON.parse(fs.readFileSync(annPath, 'utf-8'));
|
|
30
|
-
// Create system message with schema
|
|
31
|
-
const systemMessage = this.createSystemMessage(jsonSchema, modality);
|
|
32
|
-
// Create user message with document
|
|
33
|
-
const userMessage = await this.createUserMessage(docPath, modality, options);
|
|
34
|
-
// Create assistant message with annotation
|
|
35
|
-
const assistantMessage = {
|
|
36
|
-
role: 'assistant',
|
|
37
|
-
content: JSON.stringify(annotation),
|
|
38
|
-
};
|
|
39
|
-
finetuningData.push({
|
|
40
|
-
messages: [systemMessage, userMessage, assistantMessage],
|
|
41
|
-
});
|
|
42
|
-
}
|
|
43
|
-
// Write to JSONL file
|
|
44
|
-
await writeJSONL(datasetPath, finetuningData);
|
|
45
|
-
console.log(`✅ Dataset saved to ${datasetPath} with ${finetuningData.length} examples`);
|
|
46
|
-
}
|
|
47
|
-
/**
|
|
48
|
-
* Change schema in existing dataset
|
|
49
|
-
*/
|
|
50
|
-
async changeSchema(inputDatasetPath, jsonSchema, outputDatasetPath, inplace = false) {
|
|
51
|
-
if (!fs.existsSync(inputDatasetPath)) {
|
|
52
|
-
throw new Error(`Input dataset not found: ${inputDatasetPath}`);
|
|
53
|
-
}
|
|
54
|
-
const outputPath = inplace ? inputDatasetPath : (outputDatasetPath || inputDatasetPath);
|
|
55
|
-
const tempPath = `${outputPath}.tmp`;
|
|
56
|
-
try {
|
|
57
|
-
const dataset = await readJSONL(inputDatasetPath);
|
|
58
|
-
const newSystemMessage = this.createSystemMessage(jsonSchema, 'native');
|
|
59
|
-
const updatedDataset = dataset.map((item) => ({
|
|
60
|
-
...item,
|
|
61
|
-
messages: [
|
|
62
|
-
newSystemMessage,
|
|
63
|
-
...item.messages.slice(1), // Keep user and assistant messages
|
|
64
|
-
],
|
|
65
|
-
}));
|
|
66
|
-
await writeJSONL(tempPath, updatedDataset);
|
|
67
|
-
// Atomic move
|
|
68
|
-
fs.renameSync(tempPath, outputPath);
|
|
69
|
-
console.log(`✅ Schema updated in ${outputPath}`);
|
|
70
|
-
}
|
|
71
|
-
catch (error) {
|
|
72
|
-
// Cleanup temp file on error
|
|
73
|
-
if (fs.existsSync(tempPath)) {
|
|
74
|
-
fs.unlinkSync(tempPath);
|
|
75
|
-
}
|
|
76
|
-
throw error;
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
/**
|
|
80
|
-
* Stitch multiple documents and save as dataset
|
|
81
|
-
*/
|
|
82
|
-
async stitchAndSave(jsonSchema, pairsPaths, datasetPath, modality = 'native') {
|
|
83
|
-
const finetuningData = [];
|
|
84
|
-
for (const { documents: docPaths, annotation: annPath } of pairsPaths) {
|
|
85
|
-
if (!fs.existsSync(annPath)) {
|
|
86
|
-
throw new Error(`Annotation file not found: ${annPath}`);
|
|
87
|
-
}
|
|
88
|
-
// Verify all document files exist
|
|
89
|
-
for (const docPath of docPaths) {
|
|
90
|
-
if (!fs.existsSync(docPath)) {
|
|
91
|
-
throw new Error(`Document file not found: ${docPath}`);
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
const annotation = JSON.parse(fs.readFileSync(annPath, 'utf-8'));
|
|
95
|
-
const systemMessage = this.createSystemMessage(jsonSchema, modality);
|
|
96
|
-
const userMessage = await this.createMultiDocumentUserMessage(docPaths, modality);
|
|
97
|
-
const assistantMessage = {
|
|
98
|
-
role: 'assistant',
|
|
99
|
-
content: JSON.stringify(annotation),
|
|
100
|
-
};
|
|
101
|
-
finetuningData.push({
|
|
102
|
-
messages: [systemMessage, userMessage, assistantMessage],
|
|
103
|
-
});
|
|
104
|
-
}
|
|
105
|
-
await writeJSONL(datasetPath, finetuningData);
|
|
106
|
-
console.log(`✅ Stitched dataset saved to ${datasetPath} with ${finetuningData.length} examples`);
|
|
107
|
-
}
|
|
108
|
-
/**
|
|
109
|
-
* Generate annotations for documents using AI models
|
|
110
|
-
*/
|
|
111
|
-
async annotate(jsonSchema, documents, datasetPath, options = {}) {
|
|
112
|
-
const { model = 'gpt-4o-mini', temperature = 0.0, modality = 'native', maxConcurrency = 5, reasoning_effort = 'medium', provider = 'openai', } = options;
|
|
113
|
-
console.log(`🚀 Starting annotation of ${documents.length} documents...`);
|
|
114
|
-
const finetuningData = [];
|
|
115
|
-
const concurrencyLimit = Math.min(maxConcurrency, documents.length);
|
|
116
|
-
// Process documents in batches
|
|
117
|
-
for (let i = 0; i < documents.length; i += concurrencyLimit) {
|
|
118
|
-
const batch = documents.slice(i, i + concurrencyLimit);
|
|
119
|
-
const batchPromises = batch.map(async (docPath, index) => {
|
|
120
|
-
const globalIndex = i + index;
|
|
121
|
-
console.log(`📝 Processing document ${globalIndex + 1}/${documents.length}: ${path.basename(docPath)}`);
|
|
122
|
-
try {
|
|
123
|
-
const annotation = await this.generateAnnotation(jsonSchema, docPath, model, temperature, modality, reasoning_effort, provider);
|
|
124
|
-
const systemMessage = this.createSystemMessage(jsonSchema, modality);
|
|
125
|
-
const userMessage = await this.createUserMessage(docPath, modality);
|
|
126
|
-
const assistantMessage = {
|
|
127
|
-
role: 'assistant',
|
|
128
|
-
content: JSON.stringify(annotation),
|
|
129
|
-
};
|
|
130
|
-
return {
|
|
131
|
-
messages: [systemMessage, userMessage, assistantMessage],
|
|
132
|
-
};
|
|
133
|
-
}
|
|
134
|
-
catch (error) {
|
|
135
|
-
console.error(`❌ Failed to process ${docPath}:`, error);
|
|
136
|
-
return null;
|
|
137
|
-
}
|
|
138
|
-
});
|
|
139
|
-
const batchResults = await Promise.all(batchPromises);
|
|
140
|
-
finetuningData.push(...batchResults.filter(result => result !== null));
|
|
141
|
-
}
|
|
142
|
-
await writeJSONL(datasetPath, finetuningData);
|
|
143
|
-
console.log(`✅ Annotation complete! Generated ${finetuningData.length}/${documents.length} annotations`);
|
|
144
|
-
}
|
|
145
|
-
/**
|
|
146
|
-
* Update existing annotations with new model/schema
|
|
147
|
-
*/
|
|
148
|
-
async updateAnnotations(jsonSchema, oldDatasetPath, newDatasetPath, options = {}) {
|
|
149
|
-
if (!fs.existsSync(oldDatasetPath)) {
|
|
150
|
-
throw new Error(`Old dataset not found: ${oldDatasetPath}`);
|
|
151
|
-
}
|
|
152
|
-
console.log(`🔄 Updating annotations from ${oldDatasetPath}...`);
|
|
153
|
-
const oldDataset = await readJSONL(oldDatasetPath);
|
|
154
|
-
const updatedDataset = [];
|
|
155
|
-
for (let i = 0; i < oldDataset.length; i++) {
|
|
156
|
-
const item = oldDataset[i];
|
|
157
|
-
console.log(`🔄 Updating annotation ${i + 1}/${oldDataset.length}`);
|
|
158
|
-
try {
|
|
159
|
-
// Extract document path from user message (this is simplified)
|
|
160
|
-
const userContent = item.messages.find(m => m.role === 'user')?.content;
|
|
161
|
-
if (!userContent) {
|
|
162
|
-
console.warn(`⚠️ No user message found in item ${i + 1}, skipping`);
|
|
163
|
-
continue;
|
|
164
|
-
}
|
|
165
|
-
// For this implementation, we assume the document is referenced in the user message
|
|
166
|
-
// In practice, you'd need to store document paths or reconstruct them
|
|
167
|
-
const newAnnotation = await this.generateAnnotationFromUserMessage(jsonSchema, userContent, options);
|
|
168
|
-
const systemMessage = this.createSystemMessage(jsonSchema, options.modality || 'native');
|
|
169
|
-
const assistantMessage = {
|
|
170
|
-
role: 'assistant',
|
|
171
|
-
content: JSON.stringify(newAnnotation),
|
|
172
|
-
};
|
|
173
|
-
updatedDataset.push({
|
|
174
|
-
messages: [
|
|
175
|
-
systemMessage,
|
|
176
|
-
item.messages.find(m => m.role === 'user'),
|
|
177
|
-
assistantMessage,
|
|
178
|
-
],
|
|
179
|
-
});
|
|
180
|
-
}
|
|
181
|
-
catch (error) {
|
|
182
|
-
console.error(`❌ Failed to update annotation ${i + 1}:`, error);
|
|
183
|
-
}
|
|
184
|
-
}
|
|
185
|
-
await writeJSONL(newDatasetPath, updatedDataset);
|
|
186
|
-
console.log(`✅ Updated ${updatedDataset.length}/${oldDataset.length} annotations`);
|
|
187
|
-
}
|
|
188
|
-
/**
|
|
189
|
-
* Save batch annotation requests for OpenAI Batch API
|
|
190
|
-
*/
|
|
191
|
-
async saveBatchAnnotateRequests(jsonSchema, documents, batchRequestsPath, options = {}) {
|
|
192
|
-
const { model = 'gpt-4o-mini', temperature = 0.0, modality = 'native', } = options;
|
|
193
|
-
const batchRequests = [];
|
|
194
|
-
for (let i = 0; i < documents.length; i++) {
|
|
195
|
-
const docPath = documents[i];
|
|
196
|
-
const systemMessage = this.createSystemMessage(jsonSchema, modality);
|
|
197
|
-
const userMessage = await this.createUserMessage(docPath, modality);
|
|
198
|
-
batchRequests.push({
|
|
199
|
-
custom_id: `doc_${i}_${path.basename(docPath, path.extname(docPath))}`,
|
|
200
|
-
method: 'POST',
|
|
201
|
-
url: '/v1/chat/completions',
|
|
202
|
-
body: {
|
|
203
|
-
model,
|
|
204
|
-
messages: [systemMessage, userMessage],
|
|
205
|
-
temperature,
|
|
206
|
-
response_format: { type: 'json_object' },
|
|
207
|
-
},
|
|
208
|
-
});
|
|
209
|
-
}
|
|
210
|
-
await writeJSONL(batchRequestsPath, batchRequests);
|
|
211
|
-
console.log(`✅ Saved ${batchRequests.length} batch requests to ${batchRequestsPath}`);
|
|
212
|
-
}
|
|
213
|
-
/**
|
|
214
|
-
* Build dataset from batch API results
|
|
215
|
-
*/
|
|
216
|
-
async buildDatasetFromBatchResults(jsonSchema, batchResultsPath, datasetPath, modality = 'native') {
|
|
217
|
-
if (!fs.existsSync(batchResultsPath)) {
|
|
218
|
-
throw new Error(`Batch results file not found: ${batchResultsPath}`);
|
|
219
|
-
}
|
|
220
|
-
const batchResults = await readJSONL(batchResultsPath);
|
|
221
|
-
const finetuningData = [];
|
|
222
|
-
for (const result of batchResults) {
|
|
223
|
-
if (result.error) {
|
|
224
|
-
console.warn(`⚠️ Skipping failed request ${result.custom_id}: ${result.error.message}`);
|
|
225
|
-
continue;
|
|
226
|
-
}
|
|
227
|
-
const response = result.response.body;
|
|
228
|
-
const content = response.choices?.[0]?.message?.content;
|
|
229
|
-
if (!content) {
|
|
230
|
-
console.warn(`⚠️ No content in response for ${result.custom_id}`);
|
|
231
|
-
continue;
|
|
232
|
-
}
|
|
233
|
-
try {
|
|
234
|
-
const annotation = JSON.parse(content);
|
|
235
|
-
// Reconstruct messages (this is simplified)
|
|
236
|
-
const systemMessage = this.createSystemMessage(jsonSchema, modality);
|
|
237
|
-
// Extract user message from original request (would need to be stored)
|
|
238
|
-
const userMessage = {
|
|
239
|
-
role: 'user',
|
|
240
|
-
content: `Document content for ${result.custom_id}`,
|
|
241
|
-
};
|
|
242
|
-
const assistantMessage = {
|
|
243
|
-
role: 'assistant',
|
|
244
|
-
content: JSON.stringify(annotation),
|
|
245
|
-
};
|
|
246
|
-
finetuningData.push({
|
|
247
|
-
messages: [systemMessage, userMessage, assistantMessage],
|
|
248
|
-
});
|
|
249
|
-
}
|
|
250
|
-
catch (error) {
|
|
251
|
-
console.warn(`⚠️ Failed to parse annotation for ${result.custom_id}:`, error);
|
|
252
|
-
}
|
|
253
|
-
}
|
|
254
|
-
await writeJSONL(datasetPath, finetuningData);
|
|
255
|
-
console.log(`✅ Built dataset with ${finetuningData.length} examples from batch results`);
|
|
256
|
-
}
|
|
257
|
-
// Helper methods
|
|
258
|
-
createSystemMessage(jsonSchema, _modality) {
|
|
259
|
-
const schemaObj = typeof jsonSchema === 'string' ? JSON.parse(jsonSchema) : jsonSchema;
|
|
260
|
-
const schemaStr = JSON.stringify(schemaObj, null, 2);
|
|
261
|
-
return {
|
|
262
|
-
role: 'system',
|
|
263
|
-
content: `You are an expert data extraction assistant. Extract information from the provided document according to the following JSON schema:\n\n${schemaStr}\n\nReturn only valid JSON that matches the schema exactly.`,
|
|
264
|
-
};
|
|
265
|
-
}
|
|
266
|
-
async createUserMessage(docPath, _modality, _options = {}) {
|
|
267
|
-
// This is a simplified implementation
|
|
268
|
-
// In practice, you'd handle different file types, base64 encoding, etc.
|
|
269
|
-
const content = fs.readFileSync(docPath, 'utf-8');
|
|
270
|
-
return {
|
|
271
|
-
role: 'user',
|
|
272
|
-
content: `Please extract data from this document:\n\n${content}`,
|
|
273
|
-
};
|
|
274
|
-
}
|
|
275
|
-
async createMultiDocumentUserMessage(docPaths, _modality) {
|
|
276
|
-
const contents = docPaths.map((docPath, index) => {
|
|
277
|
-
const content = fs.readFileSync(docPath, 'utf-8');
|
|
278
|
-
return `Document ${index + 1} (${path.basename(docPath)}):\n${content}`;
|
|
279
|
-
}).join('\n\n---\n\n');
|
|
280
|
-
return {
|
|
281
|
-
role: 'user',
|
|
282
|
-
content: `Please extract data from these documents:\n\n${contents}`,
|
|
283
|
-
};
|
|
284
|
-
}
|
|
285
|
-
async generateAnnotation(_jsonSchema, _docPath, _model, _temperature, _modality, _reasoningEffort, _provider) {
|
|
286
|
-
// This would integrate with the actual AI providers
|
|
287
|
-
// For now, return a placeholder implementation
|
|
288
|
-
throw new Error('AI provider integration not implemented in this version');
|
|
289
|
-
}
|
|
290
|
-
async generateAnnotationFromUserMessage(_jsonSchema, _userContent, _options) {
|
|
291
|
-
// This would re-generate annotation from existing user message
|
|
292
|
-
throw new Error('Annotation update not implemented in this version');
|
|
293
|
-
}
|
|
294
|
-
}
|
|
295
|
-
/**
 * Dataset helpers attached to a SyncAPIResource.
 *
 * Thin facade: each method forwards its arguments unchanged to a private
 * BaseDatasetsMixin instance. Every method here is `async`, so callers always
 * receive a promise.
 */
export class Datasets extends SyncAPIResource {
    constructor(...resourceArgs) {
        super(...resourceArgs);
        // One mixin instance per resource; all dataset logic lives there.
        this.mixin = new BaseDatasetsMixin();
    }
    /** Delegates to BaseDatasetsMixin#pprint. */
    async pprint(datasetPath, inputTokenPrice, outputTokenPrice) {
        const { mixin } = this;
        return mixin.pprint(datasetPath, inputTokenPrice, outputTokenPrice);
    }
    /** Delegates to BaseDatasetsMixin#save. */
    async save(jsonSchema, documentAnnotationPairsPaths, datasetPath, options) {
        const { mixin } = this;
        return mixin.save(jsonSchema, documentAnnotationPairsPaths, datasetPath, options);
    }
    /** Delegates to BaseDatasetsMixin#changeSchema. */
    async changeSchema(inputDatasetPath, jsonSchema, outputDatasetPath, inplace) {
        const { mixin } = this;
        return mixin.changeSchema(inputDatasetPath, jsonSchema, outputDatasetPath, inplace);
    }
    /** Delegates to BaseDatasetsMixin#stitchAndSave. */
    async stitchAndSave(jsonSchema, pairsPaths, datasetPath, modality) {
        const { mixin } = this;
        return mixin.stitchAndSave(jsonSchema, pairsPaths, datasetPath, modality);
    }
    /** Delegates to BaseDatasetsMixin#annotate. */
    async annotate(jsonSchema, documents, datasetPath, options) {
        const { mixin } = this;
        return mixin.annotate(jsonSchema, documents, datasetPath, options);
    }
    /** Delegates to BaseDatasetsMixin#updateAnnotations. */
    async updateAnnotations(jsonSchema, oldDatasetPath, newDatasetPath, options) {
        const { mixin } = this;
        return mixin.updateAnnotations(jsonSchema, oldDatasetPath, newDatasetPath, options);
    }
    /** Delegates to BaseDatasetsMixin#saveBatchAnnotateRequests. */
    async saveBatchAnnotateRequests(jsonSchema, documents, batchRequestsPath, options) {
        const { mixin } = this;
        return mixin.saveBatchAnnotateRequests(jsonSchema, documents, batchRequestsPath, options);
    }
    /** Delegates to BaseDatasetsMixin#buildDatasetFromBatchResults. */
    async buildDatasetFromBatchResults(jsonSchema, batchResultsPath, datasetPath, modality) {
        const { mixin } = this;
        return mixin.buildDatasetFromBatchResults(jsonSchema, batchResultsPath, datasetPath, modality);
    }
}
|
|
325
|
-
/**
 * Dataset helpers attached to an AsyncAPIResource.
 *
 * Thin facade: each method forwards its arguments unchanged to a private
 * BaseDatasetsMixin instance and resolves with whatever the mixin resolves.
 */
export class AsyncDatasets extends AsyncAPIResource {
    constructor(...resourceArgs) {
        super(...resourceArgs);
        // One mixin instance per resource; all dataset logic lives there.
        this.mixin = new BaseDatasetsMixin();
    }
    /** Delegates to BaseDatasetsMixin#pprint. */
    async pprint(datasetPath, inputTokenPrice, outputTokenPrice) {
        const { mixin } = this;
        return mixin.pprint(datasetPath, inputTokenPrice, outputTokenPrice);
    }
    /** Delegates to BaseDatasetsMixin#save. */
    async save(jsonSchema, documentAnnotationPairsPaths, datasetPath, options) {
        const { mixin } = this;
        return mixin.save(jsonSchema, documentAnnotationPairsPaths, datasetPath, options);
    }
    /** Delegates to BaseDatasetsMixin#changeSchema. */
    async changeSchema(inputDatasetPath, jsonSchema, outputDatasetPath, inplace) {
        const { mixin } = this;
        return mixin.changeSchema(inputDatasetPath, jsonSchema, outputDatasetPath, inplace);
    }
    /** Delegates to BaseDatasetsMixin#stitchAndSave. */
    async stitchAndSave(jsonSchema, pairsPaths, datasetPath, modality) {
        const { mixin } = this;
        return mixin.stitchAndSave(jsonSchema, pairsPaths, datasetPath, modality);
    }
    /** Delegates to BaseDatasetsMixin#annotate. */
    async annotate(jsonSchema, documents, datasetPath, options) {
        const { mixin } = this;
        return mixin.annotate(jsonSchema, documents, datasetPath, options);
    }
    /** Delegates to BaseDatasetsMixin#updateAnnotations. */
    async updateAnnotations(jsonSchema, oldDatasetPath, newDatasetPath, options) {
        const { mixin } = this;
        return mixin.updateAnnotations(jsonSchema, oldDatasetPath, newDatasetPath, options);
    }
    /** Delegates to BaseDatasetsMixin#saveBatchAnnotateRequests. */
    async saveBatchAnnotateRequests(jsonSchema, documents, batchRequestsPath, options) {
        const { mixin } = this;
        return mixin.saveBatchAnnotateRequests(jsonSchema, documents, batchRequestsPath, options);
    }
    /** Delegates to BaseDatasetsMixin#buildDatasetFromBatchResults. */
    async buildDatasetFromBatchResults(jsonSchema, batchResultsPath, datasetPath, modality) {
        const { mixin } = this;
        return mixin.buildDatasetFromBatchResults(jsonSchema, batchResultsPath, datasetPath, modality);
    }
}
|
|
355
|
-
// Default export: the three dataset classes bundled in one object, so they are
// reachable through a default import in addition to their named references.
export default {
    Datasets,
    AsyncDatasets,
    BaseDatasetsMixin,
};
|
package/dist/utils/display.d.ts
DELETED
|
@@ -1,108 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Rich display and visualization utilities for datasets and metrics
|
|
3
|
-
* Equivalent to Python's display.py
|
|
4
|
-
*/
|
|
5
|
-
/**
 * Summary statistics for a dataset. This is the type that
 * processDatasetAndComputeMetrics resolves to and that displayMetrics accepts.
 */
export interface DatasetMetrics {
    /** Number of examples counted (presumably one per dataset line; confirm in display.ts). */
    totalExamples: number;
    /** Distribution of input token counts; p95/p99 are presumably percentiles (confirm in calculateStats). */
    inputTokens: {
        total: number;
        min: number;
        max: number;
        mean: number;
        median: number;
        p95: number;
        p99: number;
    };
    /** Distribution of output token counts, same shape as `inputTokens`. */
    outputTokens: {
        total: number;
        min: number;
        max: number;
        mean: number;
        median: number;
        p95: number;
        p99: number;
    };
    /** Combined token distribution; unlike the two above it carries no p95/p99 fields. */
    totalTokens: {
        total: number;
        min: number;
        max: number;
        mean: number;
        median: number;
    };
    /** Cost estimate split into input, output and total (currency and pricing unit are not visible here; TODO confirm). */
    estimatedCost: {
        input: number;
        output: number;
        total: number;
    };
    /** Message counts by role plus the average number of messages per example. */
    messageStats: {
        systemMessages: number;
        userMessages: number;
        assistantMessages: number;
        avgMessagesPerExample: number;
    };
    /** Average content length per role and image presence (length unit is not visible here; TODO confirm). */
    contentAnalysis: {
        avgSystemLength: number;
        avgUserLength: number;
        avgAssistantLength: number;
        hasImages: boolean;
        imageCount: number;
    };
}
|
|
51
|
-
/**
 * Token count broken down by content kind; the return type of
 * countContentTokens.
 */
export interface TokenCountResult {
    /** Tokens attributed to text. */
    textTokens: number;
    /** Tokens attributed to images. */
    imageTokens: number;
    /** Overall count (presumably textTokens + imageTokens; confirm in display.ts). */
    totalTokens: number;
}
|
|
56
|
-
/**
 * Count tokens in text using a simple approximation
 * In production, you'd want to use tiktoken equivalent for JavaScript
 *
 * @param text   Text to measure.
 * @param _model Optional model name; the underscore prefix suggests it is
 *               currently unused (TODO confirm in display.ts).
 * @returns The approximate token count.
 */
export declare function countTokens(text: string, _model?: string): number;
|
|
61
|
-
/**
 * Count tokens in content (text + images)
 *
 * @param content Message content to measure, given as a string.
 * @param _model  Optional model name; the underscore prefix suggests it is
 *                currently unused (TODO confirm in display.ts).
 * @returns Text, image and total token counts.
 */
export declare function countContentTokens(content: string, _model?: string): TokenCountResult;
|
|
65
|
-
/**
 * Calculate statistical metrics for an array of numbers
 *
 * @param values Numbers to summarise.
 * @returns min, max, mean, median, p95, p99 and total of `values`
 *          (behaviour for an empty array is not visible in this declaration;
 *          TODO confirm in display.ts).
 */
export declare function calculateStats(values: number[]): {
    min: number;
    max: number;
    mean: number;
    median: number;
    p95: number;
    p99: number;
    total: number;
};
|
|
77
|
-
/**
 * Process dataset and compute comprehensive metrics
 *
 * @param datasetPath      Path of the dataset file to analyse.
 * @param inputTokenPrice  Optional input-token price used for the cost estimate
 *                         (default and pricing unit live in display.ts; TODO confirm).
 * @param outputTokenPrice Optional output-token price, same caveat as above.
 * @param model            Optional model name (presumably forwarded to the
 *                         token counters; TODO confirm).
 * @returns Promise resolving to the computed DatasetMetrics.
 */
export declare function processDatasetAndComputeMetrics(datasetPath: string, inputTokenPrice?: number, outputTokenPrice?: number, model?: string): Promise<DatasetMetrics>;
|
|
81
|
-
/**
 * Display metrics in a formatted table
 *
 * @param metrics Metrics to display; nothing is returned.
 */
export declare function displayMetrics(metrics: DatasetMetrics): void;
|
|
85
|
-
/**
 * Format large numbers with appropriate units
 *
 * @param num Number to format.
 * @returns The formatted string (the exact unit suffixes are defined in
 *          display.ts and not visible in this declaration).
 */
export declare function formatNumber(num: number): string;
|
|
89
|
-
/**
 * Create a simple ASCII progress bar
 *
 * @param current Progress made so far.
 * @param total   Value that represents completion.
 * @param width   Optional bar width (presumably in characters; default is set
 *                in display.ts; TODO confirm).
 * @returns The rendered bar as a string.
 */
export declare function createProgressBar(current: number, total: number, width?: number): string;
|
|
93
|
-
/**
 * Display progress with a progress bar
 *
 * @param current Progress made so far.
 * @param total   Value that represents completion.
 * @param message Optional text shown with the bar; nothing is returned.
 */
export declare function displayProgress(current: number, total: number, message?: string): void;
|
|
97
|
-
/**
 * Shape of the module's default export: an object exposing each of the
 * functions declared in this file under its own name.
 */
declare const _default: {
    processDatasetAndComputeMetrics: typeof processDatasetAndComputeMetrics;
    displayMetrics: typeof displayMetrics;
    countTokens: typeof countTokens;
    countContentTokens: typeof countContentTokens;
    calculateStats: typeof calculateStats;
    formatNumber: typeof formatNumber;
    createProgressBar: typeof createProgressBar;
    displayProgress: typeof displayProgress;
};
export default _default;
|
|
108
|
-
//# sourceMappingURL=display.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"display.d.ts","sourceRoot":"","sources":["../../src/utils/display.ts"],"names":[],"mappings":"AAGA;;;GAGG;AAEH,MAAM,WAAW,cAAc;IAC7B,aAAa,EAAE,MAAM,CAAC;IACtB,WAAW,EAAE;QACX,KAAK,EAAE,MAAM,CAAC;QACd,GAAG,EAAE,MAAM,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;QACZ,IAAI,EAAE,MAAM,CAAC;QACb,MAAM,EAAE,MAAM,CAAC;QACf,GAAG,EAAE,MAAM,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;KACb,CAAC;IACF,YAAY,EAAE;QACZ,KAAK,EAAE,MAAM,CAAC;QACd,GAAG,EAAE,MAAM,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;QACZ,IAAI,EAAE,MAAM,CAAC;QACb,MAAM,EAAE,MAAM,CAAC;QACf,GAAG,EAAE,MAAM,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;KACb,CAAC;IACF,WAAW,EAAE;QACX,KAAK,EAAE,MAAM,CAAC;QACd,GAAG,EAAE,MAAM,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;QACZ,IAAI,EAAE,MAAM,CAAC;QACb,MAAM,EAAE,MAAM,CAAC;KAChB,CAAC;IACF,aAAa,EAAE;QACb,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;QACf,KAAK,EAAE,MAAM,CAAC;KACf,CAAC;IACF,YAAY,EAAE;QACZ,cAAc,EAAE,MAAM,CAAC;QACvB,YAAY,EAAE,MAAM,CAAC;QACrB,iBAAiB,EAAE,MAAM,CAAC;QAC1B,qBAAqB,EAAE,MAAM,CAAC;KAC/B,CAAC;IACF,eAAe,EAAE;QACf,eAAe,EAAE,MAAM,CAAC;QACxB,aAAa,EAAE,MAAM,CAAC;QACtB,kBAAkB,EAAE,MAAM,CAAC;QAC3B,SAAS,EAAE,OAAO,CAAC;QACnB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AAED,MAAM,WAAW,gBAAgB;IAC/B,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;;GAGG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,GAAE,MAAsB,GAAG,MAAM,CAKhF;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,GAAE,MAAsB,GAAG,gBAAgB,CA+BpG;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG;IAChD,GAAG,EAAE,MAAM,CAAC;IACZ,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;CACf,CAuBA;AAED;;GAEG;AACH,wBAAsB,+BAA+B,CACnD,WAAW,EAAE,MAAM,EACnB,eAAe,GAAE,MAAgB,EACjC,gBAAgB,GAAE,MAAe,EACjC,KAAK,GAAE,MAAsB,GAC5B,OAAO,CAAC,cAAc,CAAC,CAiGzB;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,OAAO,EAAE,cAAc,GAAG,IAAI,CAoD5D;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAOhD;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAA
E,KAAK,GAAE,MAAW,GAAG,MAAM,CAM5F;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,CAUtF;;;;;;;;;;;AAED,wBASE"}
|