@retab/node 0.0.0-reserved → 0.0.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270) hide show
  1. package/README.md +293 -2
  2. package/dist/api/client.d.ts +15 -0
  3. package/dist/api/client.d.ts.map +1 -0
  4. package/dist/api/client.js +16 -0
  5. package/dist/api/consensus/client.d.ts +7 -0
  6. package/dist/api/consensus/client.d.ts.map +1 -0
  7. package/dist/api/consensus/client.js +14 -0
  8. package/dist/api/deployments/client.d.ts +20 -0
  9. package/dist/api/deployments/client.d.ts.map +1 -0
  10. package/dist/api/deployments/client.js +23 -0
  11. package/dist/api/documents/client.d.ts +10 -0
  12. package/dist/api/documents/client.d.ts.map +1 -0
  13. package/dist/api/documents/client.js +35 -0
  14. package/dist/api/models/client.d.ts +17 -0
  15. package/dist/api/models/client.d.ts.map +1 -0
  16. package/dist/api/models/client.js +15 -0
  17. package/dist/api/schemas/client.d.ts +12 -0
  18. package/dist/api/schemas/client.d.ts.map +1 -0
  19. package/dist/api/schemas/client.js +14 -0
  20. package/dist/client.d.ts +50 -0
  21. package/dist/client.d.ts.map +1 -0
  22. package/dist/client.js +135 -0
  23. package/dist/errors.d.ts +34 -0
  24. package/dist/errors.d.ts.map +1 -0
  25. package/dist/errors.js +53 -0
  26. package/dist/generated_types.d.ts +64373 -0
  27. package/dist/generated_types.d.ts.map +1 -0
  28. package/dist/generated_types.js +2267 -0
  29. package/dist/index.d.ts +8 -0
  30. package/dist/index.d.ts.map +1 -0
  31. package/dist/index.js +9 -0
  32. package/dist/mime.d.ts +5 -0
  33. package/dist/mime.d.ts.map +1 -0
  34. package/dist/mime.js +66 -0
  35. package/dist/resource.d.ts +12 -0
  36. package/dist/resource.d.ts.map +1 -0
  37. package/dist/resource.js +19 -0
  38. package/dist/resources/consensus/completions.d.ts +66 -0
  39. package/dist/resources/consensus/completions.d.ts.map +1 -0
  40. package/dist/resources/consensus/completions.js +84 -0
  41. package/dist/resources/consensus/index.d.ts +72 -0
  42. package/dist/resources/consensus/index.d.ts.map +1 -0
  43. package/dist/resources/consensus/index.js +76 -0
  44. package/dist/resources/consensus/responses.d.ts +69 -0
  45. package/dist/resources/consensus/responses.d.ts.map +1 -0
  46. package/dist/resources/consensus/responses.js +99 -0
  47. package/dist/resources/documents/extractions.d.ts +74 -0
  48. package/dist/resources/documents/extractions.d.ts.map +1 -0
  49. package/dist/resources/documents/extractions.js +196 -0
  50. package/dist/resources/documents/index.d.ts +21 -0
  51. package/dist/resources/documents/index.d.ts.map +1 -0
  52. package/dist/resources/documents/index.js +55 -0
  53. package/dist/resources/evaluations/documents.d.ts +40 -0
  54. package/dist/resources/evaluations/documents.d.ts.map +1 -0
  55. package/dist/resources/evaluations/documents.js +123 -0
  56. package/dist/resources/evaluations/index.d.ts +14 -0
  57. package/dist/resources/evaluations/index.d.ts.map +1 -0
  58. package/dist/resources/evaluations/index.js +17 -0
  59. package/dist/resources/evaluations/iterations.d.ts +50 -0
  60. package/dist/resources/evaluations/iterations.d.ts.map +1 -0
  61. package/dist/resources/evaluations/iterations.js +156 -0
  62. package/dist/resources/files.d.ts +82 -0
  63. package/dist/resources/files.d.ts.map +1 -0
  64. package/dist/resources/files.js +150 -0
  65. package/dist/resources/finetuning.d.ts +105 -0
  66. package/dist/resources/finetuning.d.ts.map +1 -0
  67. package/dist/resources/finetuning.js +181 -0
  68. package/dist/resources/index.d.ts +11 -0
  69. package/dist/resources/index.d.ts.map +1 -0
  70. package/dist/resources/index.js +10 -0
  71. package/dist/resources/models.d.ts +57 -0
  72. package/dist/resources/models.d.ts.map +1 -0
  73. package/dist/resources/models.js +72 -0
  74. package/dist/resources/processors/automations/endpoints.d.ts +90 -0
  75. package/dist/resources/processors/automations/endpoints.d.ts.map +1 -0
  76. package/dist/resources/processors/automations/endpoints.js +145 -0
  77. package/dist/resources/processors/automations/index.d.ts +7 -0
  78. package/dist/resources/processors/automations/index.d.ts.map +1 -0
  79. package/dist/resources/processors/automations/index.js +6 -0
  80. package/dist/resources/processors/automations/links.d.ts +90 -0
  81. package/dist/resources/processors/automations/links.d.ts.map +1 -0
  82. package/dist/resources/processors/automations/links.js +149 -0
  83. package/dist/resources/processors/automations/logs.d.ts +35 -0
  84. package/dist/resources/processors/automations/logs.d.ts.map +1 -0
  85. package/dist/resources/processors/automations/logs.js +60 -0
  86. package/dist/resources/processors/automations/mailboxes.d.ts +102 -0
  87. package/dist/resources/processors/automations/mailboxes.d.ts.map +1 -0
  88. package/dist/resources/processors/automations/mailboxes.js +157 -0
  89. package/dist/resources/processors/automations/outlook.d.ts +114 -0
  90. package/dist/resources/processors/automations/outlook.d.ts.map +1 -0
  91. package/dist/resources/processors/automations/outlook.js +170 -0
  92. package/dist/resources/processors/automations/tests.d.ts +58 -0
  93. package/dist/resources/processors/automations/tests.d.ts.map +1 -0
  94. package/dist/resources/processors/automations/tests.js +90 -0
  95. package/dist/resources/processors/index.d.ts +303 -0
  96. package/dist/resources/processors/index.d.ts.map +1 -0
  97. package/dist/resources/processors/index.js +261 -0
  98. package/dist/resources/schemas.d.ts +63 -0
  99. package/dist/resources/schemas.d.ts.map +1 -0
  100. package/dist/resources/schemas.js +183 -0
  101. package/dist/resources/secrets/external_api_keys.d.ts +61 -0
  102. package/dist/resources/secrets/external_api_keys.d.ts.map +1 -0
  103. package/dist/resources/secrets/external_api_keys.js +120 -0
  104. package/dist/resources/secrets/index.d.ts +14 -0
  105. package/dist/resources/secrets/index.d.ts.map +1 -0
  106. package/dist/resources/secrets/index.js +17 -0
  107. package/dist/resources/secrets/webhooks.d.ts +73 -0
  108. package/dist/resources/secrets/webhooks.d.ts.map +1 -0
  109. package/dist/resources/secrets/webhooks.js +145 -0
  110. package/dist/resources/usage.d.ts +223 -0
  111. package/dist/resources/usage.d.ts.map +1 -0
  112. package/dist/resources/usage.js +310 -0
  113. package/dist/types/ai_models.d.ts +389 -0
  114. package/dist/types/ai_models.d.ts.map +1 -0
  115. package/dist/types/ai_models.js +145 -0
  116. package/dist/types/automations/cron.d.ts +28 -0
  117. package/dist/types/automations/cron.d.ts.map +1 -0
  118. package/dist/types/automations/cron.js +1 -0
  119. package/dist/types/automations/endpoints.d.ts +13 -0
  120. package/dist/types/automations/endpoints.d.ts.map +1 -0
  121. package/dist/types/automations/endpoints.js +1 -0
  122. package/dist/types/automations/index.d.ts +7 -0
  123. package/dist/types/automations/index.d.ts.map +1 -0
  124. package/dist/types/automations/index.js +6 -0
  125. package/dist/types/automations/links.d.ts +15 -0
  126. package/dist/types/automations/links.d.ts.map +1 -0
  127. package/dist/types/automations/links.js +1 -0
  128. package/dist/types/automations/mailboxes.d.ts +18 -0
  129. package/dist/types/automations/mailboxes.d.ts.map +1 -0
  130. package/dist/types/automations/mailboxes.js +1 -0
  131. package/dist/types/automations/outlook.d.ts +37 -0
  132. package/dist/types/automations/outlook.d.ts.map +1 -0
  133. package/dist/types/automations/outlook.js +1 -0
  134. package/dist/types/automations/webhooks.d.ts +13 -0
  135. package/dist/types/automations/webhooks.d.ts.map +1 -0
  136. package/dist/types/automations/webhooks.js +1 -0
  137. package/dist/types/browser_canvas.d.ts +4 -0
  138. package/dist/types/browser_canvas.d.ts.map +1 -0
  139. package/dist/types/browser_canvas.js +2 -0
  140. package/dist/types/chat.d.ts +99 -0
  141. package/dist/types/chat.d.ts.map +1 -0
  142. package/dist/types/chat.js +20 -0
  143. package/dist/types/consensus.d.ts +10 -0
  144. package/dist/types/consensus.d.ts.map +1 -0
  145. package/dist/types/consensus.js +1 -0
  146. package/dist/types/db/annotations.d.ts +108 -0
  147. package/dist/types/db/annotations.d.ts.map +1 -0
  148. package/dist/types/db/annotations.js +6 -0
  149. package/dist/types/db/files.d.ts +133 -0
  150. package/dist/types/db/files.d.ts.map +1 -0
  151. package/dist/types/db/files.js +5 -0
  152. package/dist/types/documents/extractions.d.ts +1849 -0
  153. package/dist/types/documents/extractions.d.ts.map +1 -0
  154. package/dist/types/documents/extractions.js +211 -0
  155. package/dist/types/documents/processing.d.ts +249 -0
  156. package/dist/types/documents/processing.d.ts.map +1 -0
  157. package/dist/types/documents/processing.js +6 -0
  158. package/dist/types/evaluations/iterations.d.ts +41 -0
  159. package/dist/types/evaluations/iterations.d.ts.map +1 -0
  160. package/dist/types/evaluations/iterations.js +1 -0
  161. package/dist/types/jobs/base.d.ts +162 -0
  162. package/dist/types/jobs/base.d.ts.map +1 -0
  163. package/dist/types/jobs/base.js +6 -0
  164. package/dist/types/jobs/specialized.d.ts +200 -0
  165. package/dist/types/jobs/specialized.d.ts.map +1 -0
  166. package/dist/types/jobs/specialized.js +37 -0
  167. package/dist/types/logs.d.ts +92 -0
  168. package/dist/types/logs.d.ts.map +1 -0
  169. package/dist/types/logs.js +1 -0
  170. package/dist/types/mime.d.ts +426 -0
  171. package/dist/types/mime.d.ts.map +1 -0
  172. package/dist/types/mime.js +48 -0
  173. package/dist/types/modalities.d.ts +31 -0
  174. package/dist/types/modalities.d.ts.map +1 -0
  175. package/dist/types/modalities.js +109 -0
  176. package/dist/types/pagination.d.ts +5 -0
  177. package/dist/types/pagination.d.ts.map +1 -0
  178. package/dist/types/pagination.js +1 -0
  179. package/dist/types/schemas/enhancement.d.ts +250 -0
  180. package/dist/types/schemas/enhancement.d.ts.map +1 -0
  181. package/dist/types/schemas/enhancement.js +6 -0
  182. package/dist/types/schemas/generate.d.ts +160 -0
  183. package/dist/types/schemas/generate.d.ts.map +1 -0
  184. package/dist/types/schemas/generate.js +19 -0
  185. package/dist/types/schemas/object.d.ts +116 -0
  186. package/dist/types/schemas/object.d.ts.map +1 -0
  187. package/dist/types/schemas/object.js +861 -0
  188. package/dist/types/secrets/external_api_keys.d.ts +27 -0
  189. package/dist/types/secrets/external_api_keys.d.ts.map +1 -0
  190. package/dist/types/secrets/external_api_keys.js +11 -0
  191. package/dist/types/secrets/index.d.ts +2 -0
  192. package/dist/types/secrets/index.d.ts.map +1 -0
  193. package/dist/types/secrets/index.js +1 -0
  194. package/dist/types/standards.d.ts +37 -0
  195. package/dist/types/standards.d.ts.map +1 -0
  196. package/dist/types/standards.js +1 -0
  197. package/dist/types.d.ts +276 -0
  198. package/dist/types.d.ts.map +1 -0
  199. package/dist/types.js +85 -0
  200. package/dist/utils/ai_models.d.ts +10 -0
  201. package/dist/utils/ai_models.d.ts.map +1 -0
  202. package/dist/utils/ai_models.js +183 -0
  203. package/dist/utils/batch_processing.d.ts +227 -0
  204. package/dist/utils/batch_processing.d.ts.map +1 -0
  205. package/dist/utils/batch_processing.js +268 -0
  206. package/dist/utils/benchmarking.d.ts +115 -0
  207. package/dist/utils/benchmarking.d.ts.map +1 -0
  208. package/dist/utils/benchmarking.js +355 -0
  209. package/dist/utils/chat.d.ts +70 -0
  210. package/dist/utils/chat.d.ts.map +1 -0
  211. package/dist/utils/chat.js +79 -0
  212. package/dist/utils/cost_calculation.d.ts +26 -0
  213. package/dist/utils/cost_calculation.d.ts.map +1 -0
  214. package/dist/utils/cost_calculation.js +99 -0
  215. package/dist/utils/datasets.d.ts +135 -0
  216. package/dist/utils/datasets.d.ts.map +1 -0
  217. package/dist/utils/datasets.js +359 -0
  218. package/dist/utils/display.d.ts +108 -0
  219. package/dist/utils/display.d.ts.map +1 -0
  220. package/dist/utils/display.js +244 -0
  221. package/dist/utils/hash.d.ts +18 -0
  222. package/dist/utils/hash.d.ts.map +1 -0
  223. package/dist/utils/hash.js +31 -0
  224. package/dist/utils/hashing.d.ts +18 -0
  225. package/dist/utils/hashing.d.ts.map +1 -0
  226. package/dist/utils/hashing.js +28 -0
  227. package/dist/utils/index.d.ts +8 -0
  228. package/dist/utils/index.d.ts.map +1 -0
  229. package/dist/utils/index.js +10 -0
  230. package/dist/utils/json_schema.d.ts +18 -0
  231. package/dist/utils/json_schema.d.ts.map +1 -0
  232. package/dist/utils/json_schema.js +334 -0
  233. package/dist/utils/json_schema_utils.d.ts +42 -0
  234. package/dist/utils/json_schema_utils.d.ts.map +1 -0
  235. package/dist/utils/json_schema_utils.js +212 -0
  236. package/dist/utils/jsonl.d.ts +60 -0
  237. package/dist/utils/jsonl.d.ts.map +1 -0
  238. package/dist/utils/jsonl.js +259 -0
  239. package/dist/utils/mime.d.ts +6 -0
  240. package/dist/utils/mime.d.ts.map +1 -0
  241. package/dist/utils/mime.js +129 -0
  242. package/dist/utils/model_cards.d.ts +219 -0
  243. package/dist/utils/model_cards.d.ts.map +1 -0
  244. package/dist/utils/model_cards.js +462 -0
  245. package/dist/utils/prompt_optimization.d.ts +96 -0
  246. package/dist/utils/prompt_optimization.d.ts.map +1 -0
  247. package/dist/utils/prompt_optimization.js +275 -0
  248. package/dist/utils/responses.d.ts +35 -0
  249. package/dist/utils/responses.d.ts.map +1 -0
  250. package/dist/utils/responses.js +37 -0
  251. package/dist/utils/stream.d.ts +13 -0
  252. package/dist/utils/stream.d.ts.map +1 -0
  253. package/dist/utils/stream.js +64 -0
  254. package/dist/utils/stream_context_managers.d.ts +147 -0
  255. package/dist/utils/stream_context_managers.d.ts.map +1 -0
  256. package/dist/utils/stream_context_managers.js +380 -0
  257. package/dist/utils/usage.d.ts +57 -0
  258. package/dist/utils/usage.d.ts.map +1 -0
  259. package/dist/utils/usage.js +97 -0
  260. package/dist/utils/webhook_secrets.d.ts +59 -0
  261. package/dist/utils/webhook_secrets.d.ts.map +1 -0
  262. package/dist/utils/webhook_secrets.js +107 -0
  263. package/dist/utils/zod_to_json_schema.d.ts +11 -0
  264. package/dist/utils/zod_to_json_schema.d.ts.map +1 -0
  265. package/dist/utils/zod_to_json_schema.js +123 -0
  266. package/dist/utils.d.ts +19 -0
  267. package/dist/utils.d.ts.map +1 -0
  268. package/dist/utils.js +1 -0
  269. package/package.json +62 -6
  270. package/index.js +0 -7
@@ -0,0 +1,99 @@
1
// Basic pricing data for common models (this would typically come from a config or API).
// The rates under `text` are applied per 1,000,000 tokens by the functions below.
// `ft_price_hike` is stored for every model but is not read anywhere in this file.
const MODEL_PRICING = {
    'gpt-4o': {
        text: { prompt: 2.5, completion: 10.0, cached_discount: 1.0 },
        ft_price_hike: 1.0,
    },
    'gpt-4o-mini': {
        text: { prompt: 0.15, completion: 0.6, cached_discount: 1.0 },
        ft_price_hike: 1.0,
    },
    'gpt-4o-2024-11-20': {
        text: { prompt: 2.5, completion: 10.0, cached_discount: 1.0 },
        ft_price_hike: 1.0,
    },
    'gpt-4o-2024-08-06': {
        text: { prompt: 2.5, completion: 10.0, cached_discount: 1.0 },
        ft_price_hike: 1.0,
    },
    'gpt-4o-mini-2024-07-18': {
        text: { prompt: 0.15, completion: 0.6, cached_discount: 1.0 },
        ft_price_hike: 1.0,
    },
    'claude-3-5-sonnet-latest': {
        text: { prompt: 3.0, completion: 15.0, cached_discount: 1.0 },
        ft_price_hike: 1.0,
    },
    'claude-3-5-sonnet-20241022': {
        text: { prompt: 3.0, completion: 15.0, cached_discount: 1.0 },
        ft_price_hike: 1.0,
    },
    'gemini-2.0-flash': {
        text: { prompt: 0.075, completion: 0.3, cached_discount: 1.0 },
        ft_price_hike: 1.0,
    },
    'gemini-2.5-pro': {
        text: { prompt: 1.25, completion: 5.0, cached_discount: 1.0 },
        ft_price_hike: 1.0,
    },
};

// Rates in MODEL_PRICING are expressed per this many tokens.
const TOKENS_PER_RATE_UNIT = 1000000;
// Costs are rounded to 5 decimal places.
const ROUNDING_SCALE = 100000;

/**
 * Look up the pricing entry for a model, considering own keys only.
 *
 * A plain `MODEL_PRICING[model]` would also resolve inherited names such as
 * 'constructor' or 'toString', which are truthy but have no `text` rates.
 */
function findPricing(model) {
    return Object.prototype.hasOwnProperty.call(MODEL_PRICING, model)
        ? MODEL_PRICING[model]
        : undefined;
}

/**
 * Read the three token counters from a usage object, defaulting each missing
 * or falsy counter to 0. A null/undefined usage is treated as "no tokens".
 */
function readTokenCounts(usage) {
    const counters = usage == null ? {} : usage;
    return {
        promptTokens: counters.prompt_tokens || 0,
        completionTokens: counters.completion_tokens || 0,
        cachedTokens: counters.cached_tokens || 0,
    };
}

/**
 * Compute the unrounded prompt, completion and cached costs for a pricing entry.
 *
 * NOTE(review): cached tokens are billed on top of the full prompt cost
 * (prompt rate x cached_discount). If `prompt_tokens` already includes the
 * cached tokens this counts them twice -- confirm against the usage schema.
 */
function computeRawCosts(pricing, tokens) {
    const rates = pricing.text;
    return {
        prompt: (tokens.promptTokens / TOKENS_PER_RATE_UNIT) * rates.prompt,
        completion: (tokens.completionTokens / TOKENS_PER_RATE_UNIT) * rates.completion,
        cached: (tokens.cachedTokens / TOKENS_PER_RATE_UNIT) * rates.prompt * rates.cached_discount,
    };
}

/** Round a cost value to 5 decimal places. */
function roundCost(value) {
    return Math.round(value * ROUNDING_SCALE) / ROUNDING_SCALE;
}

/**
 * Compute the cost of a model usage.
 *
 * @param {string} model - Model name; must be a key of MODEL_PRICING to be priced.
 * @param {{prompt_tokens?: number, completion_tokens?: number, cached_tokens?: number}} usage -
 *   Token counters; missing counters (or a missing usage object) count as 0.
 * @param {string} [currency='USD'] - Currency label copied into the result
 *   (no conversion is performed).
 * @returns {{value: number, currency: string}} Total cost rounded to 5 decimal
 *   places, or a zero cost when the model has no pricing entry.
 */
export function computeCostFromModel(model, usage, currency = 'USD') {
    const pricing = findPricing(model);
    if (!pricing) {
        // Return zero cost for unknown models
        return { value: 0, currency };
    }
    const raw = computeRawCosts(pricing, readTokenCounts(usage));
    return {
        value: roundCost(raw.prompt + raw.completion + raw.cached),
        currency,
    };
}

/**
 * Compute the cost of a model usage, broken down by prompt, completion and
 * cached tokens.
 *
 * @param {string} model - Model name; must be a key of MODEL_PRICING to be priced.
 * @param {{prompt_tokens?: number, completion_tokens?: number, cached_tokens?: number}} usage -
 *   Token counters; missing counters (or a missing usage object) count as 0.
 * @param {string} [currency='USD'] - Currency label copied into every cost
 *   (no conversion is performed).
 * @returns {object} `prompt_cost`, `completion_cost`, `cached_cost` and
 *   `total_cost` (each `{value, currency}`, rounded to 5 decimal places) plus the
 *   `prompt_tokens`, `completion_tokens` and `cached_tokens` that were used.
 *   Every cost is zero when the model has no pricing entry. The total is
 *   rounded from the unrounded parts, so it may differ from the sum of the
 *   rounded parts in the last decimal place.
 */
export function computeCostFromModelWithBreakdown(model, usage, currency = 'USD') {
    const tokens = readTokenCounts(usage);
    const pricing = findPricing(model);
    const raw = pricing
        ? computeRawCosts(pricing, tokens)
        : { prompt: 0, completion: 0, cached: 0 };
    // Each cost is a distinct object so callers can mutate one without
    // affecting the others.
    return {
        prompt_cost: { value: roundCost(raw.prompt), currency },
        completion_cost: { value: roundCost(raw.completion), currency },
        cached_cost: { value: roundCost(raw.cached), currency },
        total_cost: { value: roundCost(raw.prompt + raw.completion + raw.cached), currency },
        prompt_tokens: tokens.promptTokens,
        completion_tokens: tokens.completionTokens,
        cached_tokens: tokens.cachedTokens,
    };
}
@@ -0,0 +1,135 @@
1
+ import { SyncAPIResource, AsyncAPIResource } from '../resource.js';
2
+ import { DatasetMetrics } from './display.js';
3
+ /**
4
+ * Advanced Dataset management utilities for ML training workflows
5
+ * Equivalent to Python's jsonlUtils.py
6
+ */
7
/** A single record holding a list of chat messages. */
export interface FinetuningJSON {
    /** Messages of the record, each with a role and its text content. */
    messages: {
        role: 'system' | 'user' | 'assistant';
        content: string;
    }[];
}
13
/** A document paired with its annotation object. */
export interface DocumentAnnotationPair {
    /** The document, given either as a string or as a Buffer. */
    document: Buffer | string;
    /** Free-form annotation keyed by string. */
    annotation: Record<string, any>;
}
17
/** One request line of a batch JSONL file. */
export interface BatchJSONLRequest {
    /** Caller-chosen identifier for the request. */
    custom_id: string;
    /** HTTP method; only 'POST' is allowed by this type. */
    method: 'POST';
    /** Target URL of the request. */
    url: string;
    /** Request payload. */
    body: Record<string, any>;
}
23
/** One result line of a batch JSONL file. */
export interface BatchJSONLResponse {
    id: string;
    /** Identifier of the request -- presumably matches BatchJSONLRequest.custom_id; confirm. */
    custom_id: string;
    /** Response details for the request. */
    response: {
        status_code: number;
        request_id: string;
        body: Record<string, any>;
    };
    /** Optional error details, each with a code and a message. */
    error?: {
        code: string;
        message: string;
    };
}
36
/** Optional settings accepted by the annotation-related dataset methods. */
export interface AnnotationOptions {
    /** Model name. */
    model?: string;
    temperature?: number;
    modality?: 'text' | 'native';
    maxConcurrency?: number;
    reasoning_effort?: 'high' | 'medium' | 'low';
    provider?: 'gemini' | 'xai' | 'anthropic' | 'openai';
    idempotencyKey?: string;
}
45
/** Optional settings accepted by the dataset `save` method. */
export interface SaveOptions {
    modality?: 'text' | 'native';
    /** Image resolution, in DPI as the name states. */
    imageResolutionDpi?: number;
    /** Canvas size, one of the listed paper formats. */
    browserCanvas?: 'A5' | 'A4' | 'A3';
}
50
/** Dataset operations shared by the sync and async dataset resources. */
export declare class BaseDatasetsMixin {
    /**
     * Compute comprehensive metrics for the dataset at `datasetPath`.
     */
    pprint(
        datasetPath: string,
        inputTokenPrice?: number,
        outputTokenPrice?: number
    ): Promise<DatasetMetrics>;

    /**
     * Write document/annotation pairs to a JSONL training dataset.
     */
    save(
        jsonSchema: string | Record<string, any>,
        documentAnnotationPairsPaths: { document: string; annotation: string }[],
        datasetPath: string,
        options?: SaveOptions
    ): Promise<void>;

    /**
     * Replace the schema used by an existing dataset.
     */
    changeSchema(
        inputDatasetPath: string,
        jsonSchema: string | Record<string, any>,
        outputDatasetPath?: string,
        inplace?: boolean
    ): Promise<void>;

    /**
     * Stitch several documents together per annotation and save the result as a dataset.
     */
    stitchAndSave(
        jsonSchema: string | Record<string, any>,
        pairsPaths: { documents: string[]; annotation: string }[],
        datasetPath: string,
        modality?: 'text' | 'native'
    ): Promise<void>;

    /**
     * Produce annotations for the given documents with an AI model.
     */
    annotate(
        jsonSchema: string | Record<string, any>,
        documents: string[],
        datasetPath: string,
        options?: AnnotationOptions
    ): Promise<void>;

    /**
     * Re-annotate an existing dataset with a new model and/or schema.
     */
    updateAnnotations(
        jsonSchema: string | Record<string, any>,
        oldDatasetPath: string,
        newDatasetPath: string,
        options?: AnnotationOptions
    ): Promise<void>;

    /**
     * Write batch annotation requests for the OpenAI Batch API.
     */
    saveBatchAnnotateRequests(
        jsonSchema: string | Record<string, any>,
        documents: string[],
        batchRequestsPath: string,
        options?: AnnotationOptions
    ): Promise<void>;

    /**
     * Assemble a dataset from batch API results.
     */
    buildDatasetFromBatchResults(
        jsonSchema: string | Record<string, any>,
        batchResultsPath: string,
        datasetPath: string,
        modality?: 'text' | 'native'
    ): Promise<void>;

    private createSystemMessage;
    private createUserMessage;
    private createMultiDocumentUserMessage;
    private generateAnnotation;
    private generateAnnotationFromUserMessage;
}
95
/** Dataset resource built on SyncAPIResource; exposes the same methods as BaseDatasetsMixin. */
export declare class Datasets extends SyncAPIResource {
    private mixin;

    pprint(
        datasetPath: string,
        inputTokenPrice?: number,
        outputTokenPrice?: number
    ): Promise<DatasetMetrics>;

    save(
        jsonSchema: string | Record<string, any>,
        documentAnnotationPairsPaths: { document: string; annotation: string }[],
        datasetPath: string,
        options?: SaveOptions
    ): Promise<void>;

    changeSchema(
        inputDatasetPath: string,
        jsonSchema: string | Record<string, any>,
        outputDatasetPath?: string,
        inplace?: boolean
    ): Promise<void>;

    stitchAndSave(
        jsonSchema: string | Record<string, any>,
        pairsPaths: { documents: string[]; annotation: string }[],
        datasetPath: string,
        modality?: 'text' | 'native'
    ): Promise<void>;

    annotate(
        jsonSchema: string | Record<string, any>,
        documents: string[],
        datasetPath: string,
        options?: AnnotationOptions
    ): Promise<void>;

    updateAnnotations(
        jsonSchema: string | Record<string, any>,
        oldDatasetPath: string,
        newDatasetPath: string,
        options?: AnnotationOptions
    ): Promise<void>;

    saveBatchAnnotateRequests(
        jsonSchema: string | Record<string, any>,
        documents: string[],
        batchRequestsPath: string,
        options?: AnnotationOptions
    ): Promise<void>;

    buildDatasetFromBatchResults(
        jsonSchema: string | Record<string, any>,
        batchResultsPath: string,
        datasetPath: string,
        modality?: 'text' | 'native'
    ): Promise<void>;
}
112
/** Dataset resource built on AsyncAPIResource; exposes the same methods as BaseDatasetsMixin. */
export declare class AsyncDatasets extends AsyncAPIResource {
    private mixin;

    pprint(
        datasetPath: string,
        inputTokenPrice?: number,
        outputTokenPrice?: number
    ): Promise<DatasetMetrics>;

    save(
        jsonSchema: string | Record<string, any>,
        documentAnnotationPairsPaths: { document: string; annotation: string }[],
        datasetPath: string,
        options?: SaveOptions
    ): Promise<void>;

    changeSchema(
        inputDatasetPath: string,
        jsonSchema: string | Record<string, any>,
        outputDatasetPath?: string,
        inplace?: boolean
    ): Promise<void>;

    stitchAndSave(
        jsonSchema: string | Record<string, any>,
        pairsPaths: { documents: string[]; annotation: string }[],
        datasetPath: string,
        modality?: 'text' | 'native'
    ): Promise<void>;

    annotate(
        jsonSchema: string | Record<string, any>,
        documents: string[],
        datasetPath: string,
        options?: AnnotationOptions
    ): Promise<void>;

    updateAnnotations(
        jsonSchema: string | Record<string, any>,
        oldDatasetPath: string,
        newDatasetPath: string,
        options?: AnnotationOptions
    ): Promise<void>;

    saveBatchAnnotateRequests(
        jsonSchema: string | Record<string, any>,
        documents: string[],
        batchRequestsPath: string,
        options?: AnnotationOptions
    ): Promise<void>;

    buildDatasetFromBatchResults(
        jsonSchema: string | Record<string, any>,
        batchResultsPath: string,
        datasetPath: string,
        modality?: 'text' | 'native'
    ): Promise<void>;
}
129
/** Shape of the module's default export: the three dataset classes keyed by name. */
declare const _default: {
    BaseDatasetsMixin: typeof BaseDatasetsMixin;
    Datasets: typeof Datasets;
    AsyncDatasets: typeof AsyncDatasets;
};
export default _default;
135
+ //# sourceMappingURL=datasets.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"datasets.d.ts","sourceRoot":"","sources":["../../src/utils/datasets.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAmD,cAAc,EAAE,MAAM,cAAc,CAAC;AAE/F;;;GAGG;AAEH,MAAM,WAAW,cAAc;IAC7B,QAAQ,EAAE,KAAK,CAAC;QACd,IAAI,EAAE,QAAQ,GAAG,MAAM,GAAG,WAAW,CAAC;QACtC,OAAO,EAAE,MAAM,CAAC;KACjB,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,sBAAsB;IACrC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAAC;IAC1B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CACjC;AAED,MAAM,WAAW,iBAAiB;IAChC,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAC3B;AAED,MAAM,WAAW,kBAAkB;IACjC,EAAE,EAAE,MAAM,CAAC;IACX,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE;QACR,WAAW,EAAE,MAAM,CAAC;QACpB,UAAU,EAAE,MAAM,CAAC;QACnB,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;KAC3B,CAAC;IACF,KAAK,CAAC,EAAE;QACN,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,EAAE,MAAM,CAAC;KACjB,CAAC;CACH;AAED,MAAM,WAAW,iBAAiB;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,CAAC;IAC7B,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,gBAAgB,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;IAC7C,QAAQ,CAAC,EAAE,QAAQ,GAAG,WAAW,GAAG,KAAK,GAAG,QAAQ,CAAC;IACrD,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,WAAW;IAC1B,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,CAAC;IAC7B,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,aAAa,CAAC,EAAE,IAAI,GAAG,IAAI,GAAG,IAAI,CAAC;CACpC;AAED,qBAAa,iBAAiB;IAC5B;;OAEG;IACG,MAAM,CACV,WAAW,EAAE,MAAM,EACnB,eAAe,GAAE,MAAgB,EACjC,gBAAgB,GAAE,MAAe,GAChC,OAAO,CAAC,cAAc,CAAC;IAe1B;;OAEG;IACG,IAAI,CACR,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,4BAA4B,EAAE,KAAK,CAAC;QAClC,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC,EACF,WAAW,EAAE,MAAM,EACnB,OAAO,GAAE,WAAgB,GACxB,OAAO,CAAC,IAAI,CAAC;IAkChB;;OAEG;IACG,YAAY,CAChB,gBAAgB,EAAE,MAAM,EACxB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,iBAAiB,CAAC,EAAE,MAAM,EAC1B,OAAO,GAAE,OAAe,GACvB,OAAO,CAAC,IAAI,CAAC;IAkChB;;OAEG;IACG,aAAa,CACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,M
AAM,EACxC,UAAU,EAAE,KAAK,CAAC;QAChB,SAAS,EAAE,MAAM,EAAE,CAAC;QACpB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC,EACF,WAAW,EAAE,MAAM,EACnB,QAAQ,GAAE,QAAQ,GAAG,MAAiB,GACrC,OAAO,CAAC,IAAI,CAAC;IAkChB;;OAEG;IACG,QAAQ,CACZ,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,WAAW,EAAE,MAAM,EACnB,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,IAAI,CAAC;IAyDhB;;OAEG;IACG,iBAAiB,CACrB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,cAAc,EAAE,MAAM,EACtB,cAAc,EAAE,MAAM,EACtB,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,IAAI,CAAC;IAoDhB;;OAEG;IACG,yBAAyB,CAC7B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,iBAAiB,EAAE,MAAM,EACzB,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,IAAI,CAAC;IA+BhB;;OAEG;IACG,4BAA4B,CAChC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,gBAAgB,EAAE,MAAM,EACxB,WAAW,EAAE,MAAM,EACnB,QAAQ,GAAE,QAAQ,GAAG,MAAiB,GACrC,OAAO,CAAC,IAAI,CAAC;IAoDhB,OAAO,CAAC,mBAAmB;YAab,iBAAiB;YAejB,8BAA8B;YAe9B,kBAAkB;YAclB,iCAAiC;CAQhD;AAED,qBAAa,QAAS,SAAQ,eAAe;IAC3C,OAAO,CAAC,KAAK,CAA2B;IAElC,MAAM,CACV,WAAW,EAAE,MAAM,EACnB,eAAe,CAAC,EAAE,MAAM,EACxB,gBAAgB,CAAC,EAAE,MAAM,GACxB,OAAO,CAAC,cAAc,CAAC;IAIpB,IAAI,CACR,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,4BAA4B,EAAE,KAAK,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,CAAC,EAC7E,WAAW,EAAE,MAAM,EACnB,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,IAAI,CAAC;IAIV,YAAY,CAChB,gBAAgB,EAAE,MAAM,EACxB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,iBAAiB,CAAC,EAAE,MAAM,EAC1B,OAAO,CAAC,EAAE,OAAO,GAChB,OAAO,CAAC,IAAI,CAAC;IAIV,aAAa,CACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,UAAU,EAAE,KAAK,CAAC;QAAE,SAAS,EAAE,MAAM,EAAE,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,CAAC,EAC9D,WAAW,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,GAC3B,OAAO,CAAC,IAAI,CAAC;IAIV,QAAQ,CACZ,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,WAAW,EAAE,MAAM,EACnB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,iBAAiB,CACrB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,E
ACxC,cAAc,EAAE,MAAM,EACtB,cAAc,EAAE,MAAM,EACtB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,yBAAyB,CAC7B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,iBAAiB,EAAE,MAAM,EACzB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,4BAA4B,CAChC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,gBAAgB,EAAE,MAAM,EACxB,WAAW,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,GAC3B,OAAO,CAAC,IAAI,CAAC;CAGjB;AAED,qBAAa,aAAc,SAAQ,gBAAgB;IACjD,OAAO,CAAC,KAAK,CAA2B;IAElC,MAAM,CACV,WAAW,EAAE,MAAM,EACnB,eAAe,CAAC,EAAE,MAAM,EACxB,gBAAgB,CAAC,EAAE,MAAM,GACxB,OAAO,CAAC,cAAc,CAAC;IAIpB,IAAI,CACR,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,4BAA4B,EAAE,KAAK,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,CAAC,EAC7E,WAAW,EAAE,MAAM,EACnB,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,IAAI,CAAC;IAIV,YAAY,CAChB,gBAAgB,EAAE,MAAM,EACxB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,iBAAiB,CAAC,EAAE,MAAM,EAC1B,OAAO,CAAC,EAAE,OAAO,GAChB,OAAO,CAAC,IAAI,CAAC;IAIV,aAAa,CACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,UAAU,EAAE,KAAK,CAAC;QAAE,SAAS,EAAE,MAAM,EAAE,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,CAAC,EAC9D,WAAW,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,GAC3B,OAAO,CAAC,IAAI,CAAC;IAIV,QAAQ,CACZ,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,WAAW,EAAE,MAAM,EACnB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,iBAAiB,CACrB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,cAAc,EAAE,MAAM,EACtB,cAAc,EAAE,MAAM,EACtB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,yBAAyB,CAC7B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,SAAS,EAAE,MAAM,EAAE,EACnB,iBAAiB,EAAE,MAAM,EACzB,OAAO,CAAC,EAAE,iBAAiB,GAC1B,OAAO,CAAC,IAAI,CAAC;IAIV,4BAA4B,CAChC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,MAAM,EACxC,gBAAgB,EAAE,MAAM,EACxB,WAAW,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,GAC3B,OAAO,CAAC,IAAI,CAAC;CAGjB;;;;;;AAED,wBAIE"}
@@ -0,0 +1,359 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import { SyncAPIResource, AsyncAPIResource } from '../resource.js';
4
+ import { readJSONL, writeJSONL } from './jsonl.js';
5
+ import { displayMetrics, processDatasetAndComputeMetrics } from './display.js';
6
/**
 * Local-filesystem helpers for building and rewriting JSONL fine-tuning
 * datasets.
 *
 * Every dataset row produced here has the shape
 * `{ messages: [systemMessage, userMessage, assistantMessage] }`, where the
 * assistant message carries the JSON-stringified annotation.
 */
export class BaseDatasetsMixin {
    /**
     * Compute metrics for a dataset file, print them and return them.
     *
     * @param {string} datasetPath - Path of the JSONL dataset to analyse.
     * @param {number} [inputTokenPrice=0.00015] - Forwarded unchanged to `processDatasetAndComputeMetrics`.
     * @param {number} [outputTokenPrice=0.0006] - Forwarded unchanged to `processDatasetAndComputeMetrics`.
     * @returns {Promise<*>} Whatever `processDatasetAndComputeMetrics` resolves to.
     * @throws {Error} If `datasetPath` does not exist.
     */
    async pprint(datasetPath, inputTokenPrice = 0.00015, outputTokenPrice = 0.0006) {
        if (!fs.existsSync(datasetPath)) {
            throw new Error(`Dataset file not found: ${datasetPath}`);
        }
        const metrics = await processDatasetAndComputeMetrics(datasetPath, inputTokenPrice, outputTokenPrice);
        displayMetrics(metrics);
        return metrics;
    }

    /**
     * Turn (document, annotation) file pairs into a JSONL training dataset.
     *
     * @param {object|string} jsonSchema - Schema object, or a JSON string of it.
     * @param {Array<{document: string, annotation: string}>} documentAnnotationPairsPaths
     *   Paths of each document and of the JSON file holding its annotation.
     * @param {string} datasetPath - Destination JSONL file.
     * @param {object} [options] - `modality` (default `'native'`); the whole
     *   object is also forwarded to `createUserMessage`.
     * @returns {Promise<void>}
     * @throws {Error} If any document or annotation file is missing.
     * @throws {SyntaxError} If an annotation file is not valid JSON.
     */
    async save(jsonSchema, documentAnnotationPairsPaths, datasetPath, options = {}) {
        const { modality = 'native' } = options;
        const finetuningData = [];
        for (const { document: docPath, annotation: annPath } of documentAnnotationPairsPaths) {
            if (!fs.existsSync(docPath) || !fs.existsSync(annPath)) {
                throw new Error(`Document or annotation file not found: ${docPath}, ${annPath}`);
            }
            const annotation = JSON.parse(fs.readFileSync(annPath, 'utf-8'));
            const systemMessage = this.createSystemMessage(jsonSchema, modality);
            const userMessage = await this.createUserMessage(docPath, modality, options);
            const assistantMessage = {
                role: 'assistant',
                content: JSON.stringify(annotation),
            };
            finetuningData.push({
                messages: [systemMessage, userMessage, assistantMessage],
            });
        }
        await writeJSONL(datasetPath, finetuningData);
        console.log(`✅ Dataset saved to ${datasetPath} with ${finetuningData.length} examples`);
    }

    /**
     * Replace the system (schema) message of every example in a dataset.
     *
     * The result is written to `<output>.tmp` first and then renamed over the
     * output path, so a failure never leaves a half-written dataset behind.
     *
     * @param {string} inputDatasetPath - Existing JSONL dataset.
     * @param {object|string} jsonSchema - New schema (object or JSON string).
     * @param {string} [outputDatasetPath] - Destination; falls back to the input path.
     * @param {boolean} [inplace=false] - When true, always overwrite the input file.
     * @returns {Promise<void>}
     * @throws {Error} If the input dataset does not exist; any read/write error
     *   is rethrown after the temporary file has been removed.
     */
    async changeSchema(inputDatasetPath, jsonSchema, outputDatasetPath, inplace = false) {
        if (!fs.existsSync(inputDatasetPath)) {
            throw new Error(`Input dataset not found: ${inputDatasetPath}`);
        }
        const outputPath = inplace ? inputDatasetPath : (outputDatasetPath || inputDatasetPath);
        const tempPath = `${outputPath}.tmp`;
        try {
            const dataset = await readJSONL(inputDatasetPath);
            const newSystemMessage = this.createSystemMessage(jsonSchema, 'native');
            const updatedDataset = dataset.map((item) => {
                const [firstMessage, ...remainingMessages] = item.messages;
                // Only drop the leading message when it really is the old
                // instruction message; blindly slicing would delete the user
                // message of an example that has no system message.
                const leadRole = firstMessage?.role;
                const hasInstructionLead = leadRole === 'system' || leadRole === 'developer';
                return {
                    ...item,
                    messages: [
                        newSystemMessage,
                        ...(hasInstructionLead ? remainingMessages : item.messages),
                    ],
                };
            });
            await writeJSONL(tempPath, updatedDataset);
            // Rename last so the output only ever holds a complete dataset.
            fs.renameSync(tempPath, outputPath);
            console.log(`✅ Schema updated in ${outputPath}`);
        }
        catch (error) {
            // Cleanup temp file on error
            if (fs.existsSync(tempPath)) {
                fs.unlinkSync(tempPath);
            }
            throw error;
        }
    }

    /**
     * Build a dataset where each example combines several documents into one
     * user message paired with a single annotation.
     *
     * @param {object|string} jsonSchema - Schema object or JSON string.
     * @param {Array<{documents: string[], annotation: string}>} pairsPaths
     *   Document paths to stitch together and the annotation JSON file for them.
     * @param {string} datasetPath - Destination JSONL file.
     * @param {string} [modality='native'] - Forwarded to the message builders.
     * @returns {Promise<void>}
     * @throws {Error} If an annotation or document file is missing.
     */
    async stitchAndSave(jsonSchema, pairsPaths, datasetPath, modality = 'native') {
        const finetuningData = [];
        for (const { documents: docPaths, annotation: annPath } of pairsPaths) {
            if (!fs.existsSync(annPath)) {
                throw new Error(`Annotation file not found: ${annPath}`);
            }
            // Verify all document files exist before reading any of them.
            for (const docPath of docPaths) {
                if (!fs.existsSync(docPath)) {
                    throw new Error(`Document file not found: ${docPath}`);
                }
            }
            const annotation = JSON.parse(fs.readFileSync(annPath, 'utf-8'));
            const systemMessage = this.createSystemMessage(jsonSchema, modality);
            const userMessage = await this.createMultiDocumentUserMessage(docPaths, modality);
            const assistantMessage = {
                role: 'assistant',
                content: JSON.stringify(annotation),
            };
            finetuningData.push({
                messages: [systemMessage, userMessage, assistantMessage],
            });
        }
        await writeJSONL(datasetPath, finetuningData);
        console.log(`✅ Stitched dataset saved to ${datasetPath} with ${finetuningData.length} examples`);
    }

    /**
     * Generate an annotation for every document and save the successful ones
     * as a dataset. Documents are processed in batches; a document whose
     * annotation fails is logged and left out rather than aborting the run.
     *
     * @param {object|string} jsonSchema - Schema object or JSON string.
     * @param {string[]} documents - Paths of the documents to annotate.
     * @param {string} datasetPath - Destination JSONL file.
     * @param {object} [options] - `model`, `temperature`, `modality`,
     *   `maxConcurrency` (batch size, default 5), `reasoning_effort`, `provider`.
     * @returns {Promise<void>}
     */
    async annotate(jsonSchema, documents, datasetPath, options = {}) {
        const { model = 'gpt-4o-mini', temperature = 0.0, modality = 'native', maxConcurrency = 5, reasoning_effort = 'medium', provider = 'openai', } = options;
        console.log(`🚀 Starting annotation of ${documents.length} documents...`);
        const finetuningData = [];
        // The batch size doubles as the loop step, so it must be a positive
        // integer: a step of 0 (or a negative one) would never terminate and
        // NaN would silently process nothing.
        const requestedLimit = Math.floor(maxConcurrency);
        const concurrencyLimit = Math.max(1, Math.min(Number.isNaN(requestedLimit) ? 1 : requestedLimit, documents.length));
        for (let i = 0; i < documents.length; i += concurrencyLimit) {
            const batch = documents.slice(i, i + concurrencyLimit);
            const batchPromises = batch.map(async (docPath, index) => {
                const globalIndex = i + index;
                console.log(`📝 Processing document ${globalIndex + 1}/${documents.length}: ${path.basename(docPath)}`);
                try {
                    const annotation = await this.generateAnnotation(jsonSchema, docPath, model, temperature, modality, reasoning_effort, provider);
                    const systemMessage = this.createSystemMessage(jsonSchema, modality);
                    const userMessage = await this.createUserMessage(docPath, modality);
                    const assistantMessage = {
                        role: 'assistant',
                        content: JSON.stringify(annotation),
                    };
                    return {
                        messages: [systemMessage, userMessage, assistantMessage],
                    };
                }
                catch (error) {
                    console.error(`❌ Failed to process ${docPath}:`, error);
                    return null;
                }
            });
            // Each promise handles its own failure, so Promise.all cannot reject here.
            const batchResults = await Promise.all(batchPromises);
            finetuningData.push(...batchResults.filter((result) => result !== null));
        }
        await writeJSONL(datasetPath, finetuningData);
        console.log(`✅ Annotation complete! Generated ${finetuningData.length}/${documents.length} annotations`);
    }

    /**
     * Re-generate the assistant annotation of every example in a dataset while
     * keeping its original user message. Examples without a user message, or
     * whose regeneration fails, are logged and skipped.
     *
     * @param {object|string} jsonSchema - Schema object or JSON string.
     * @param {string} oldDatasetPath - Existing JSONL dataset.
     * @param {string} newDatasetPath - Destination JSONL file.
     * @param {object} [options] - Forwarded to
     *   `generateAnnotationFromUserMessage`; `options.modality` selects the
     *   modality of the new system message (default `'native'`).
     * @returns {Promise<void>}
     * @throws {Error} If `oldDatasetPath` does not exist.
     */
    async updateAnnotations(jsonSchema, oldDatasetPath, newDatasetPath, options = {}) {
        if (!fs.existsSync(oldDatasetPath)) {
            throw new Error(`Old dataset not found: ${oldDatasetPath}`);
        }
        console.log(`🔄 Updating annotations from ${oldDatasetPath}...`);
        const oldDataset = await readJSONL(oldDatasetPath);
        const updatedDataset = [];
        for (let i = 0; i < oldDataset.length; i++) {
            const item = oldDataset[i];
            console.log(`🔄 Updating annotation ${i + 1}/${oldDataset.length}`);
            try {
                // The user message is the only record of the source document.
                const userMessage = item.messages.find((m) => m.role === 'user');
                const userContent = userMessage?.content;
                if (!userContent) {
                    console.warn(`⚠️ No user message found in item ${i + 1}, skipping`);
                    continue;
                }
                const newAnnotation = await this.generateAnnotationFromUserMessage(jsonSchema, userContent, options);
                const systemMessage = this.createSystemMessage(jsonSchema, options.modality || 'native');
                const assistantMessage = {
                    role: 'assistant',
                    content: JSON.stringify(newAnnotation),
                };
                updatedDataset.push({
                    messages: [systemMessage, userMessage, assistantMessage],
                });
            }
            catch (error) {
                console.error(`❌ Failed to update annotation ${i + 1}:`, error);
            }
        }
        await writeJSONL(newDatasetPath, updatedDataset);
        console.log(`✅ Updated ${updatedDataset.length}/${oldDataset.length} annotations`);
    }

    /**
     * Write one chat-completion batch request per document to a JSONL file.
     *
     * Each request gets `custom_id` = `doc_<index>_<file name without extension>`
     * and asks for a `json_object` response.
     *
     * @param {object|string} jsonSchema - Schema object or JSON string.
     * @param {string[]} documents - Paths of the documents to include.
     * @param {string} batchRequestsPath - Destination JSONL file.
     * @param {object} [options] - `model`, `temperature`, `modality`.
     * @returns {Promise<void>}
     */
    async saveBatchAnnotateRequests(jsonSchema, documents, batchRequestsPath, options = {}) {
        const { model = 'gpt-4o-mini', temperature = 0.0, modality = 'native', } = options;
        const batchRequests = [];
        for (let i = 0; i < documents.length; i++) {
            const docPath = documents[i];
            const systemMessage = this.createSystemMessage(jsonSchema, modality);
            const userMessage = await this.createUserMessage(docPath, modality);
            batchRequests.push({
                custom_id: `doc_${i}_${path.basename(docPath, path.extname(docPath))}`,
                method: 'POST',
                url: '/v1/chat/completions',
                body: {
                    model,
                    messages: [systemMessage, userMessage],
                    temperature,
                    response_format: { type: 'json_object' },
                },
            });
        }
        await writeJSONL(batchRequestsPath, batchRequests);
        console.log(`✅ Saved ${batchRequests.length} batch requests to ${batchRequestsPath}`);
    }

    /**
     * Convert a batch results file into a training dataset.
     *
     * Records with an `error`, without response content, or whose content is
     * not valid JSON are logged and skipped. The user message written to the
     * dataset is only a placeholder naming the request's `custom_id`, because
     * the original request body is not available here.
     *
     * @param {object|string} jsonSchema - Schema object or JSON string.
     * @param {string} batchResultsPath - JSONL file of batch results.
     * @param {string} datasetPath - Destination JSONL file.
     * @param {string} [modality='native'] - Forwarded to `createSystemMessage`.
     * @returns {Promise<void>}
     * @throws {Error} If `batchResultsPath` does not exist.
     */
    async buildDatasetFromBatchResults(jsonSchema, batchResultsPath, datasetPath, modality = 'native') {
        if (!fs.existsSync(batchResultsPath)) {
            throw new Error(`Batch results file not found: ${batchResultsPath}`);
        }
        const batchResults = await readJSONL(batchResultsPath);
        const finetuningData = [];
        for (const result of batchResults) {
            if (result.error) {
                console.warn(`⚠️ Skipping failed request ${result.custom_id}: ${result.error.message}`);
                continue;
            }
            // `response` can be null or absent; chain from the top so one such
            // record is skipped instead of aborting the whole build.
            const content = result.response?.body?.choices?.[0]?.message?.content;
            if (!content) {
                console.warn(`⚠️ No content in response for ${result.custom_id}`);
                continue;
            }
            try {
                const annotation = JSON.parse(content);
                const systemMessage = this.createSystemMessage(jsonSchema, modality);
                // Placeholder: the original request's user message is not stored in the results.
                const userMessage = {
                    role: 'user',
                    content: `Document content for ${result.custom_id}`,
                };
                const assistantMessage = {
                    role: 'assistant',
                    content: JSON.stringify(annotation),
                };
                finetuningData.push({
                    messages: [systemMessage, userMessage, assistantMessage],
                });
            }
            catch (error) {
                console.warn(`⚠️ Failed to parse annotation for ${result.custom_id}:`, error);
            }
        }
        await writeJSONL(datasetPath, finetuningData);
        console.log(`✅ Built dataset with ${finetuningData.length} examples from batch results`);
    }

    // Helper methods

    /**
     * Build the system message that embeds the pretty-printed schema.
     *
     * @param {object|string} jsonSchema - Schema object, or a JSON string that is parsed first.
     * @param {string} _modality - Currently unused.
     * @returns {{role: 'system', content: string}}
     * @throws {SyntaxError} If `jsonSchema` is a string that is not valid JSON.
     */
    createSystemMessage(jsonSchema, _modality) {
        const schemaObj = typeof jsonSchema === 'string' ? JSON.parse(jsonSchema) : jsonSchema;
        const schemaStr = JSON.stringify(schemaObj, null, 2);
        return {
            role: 'system',
            content: `You are an expert data extraction assistant. Extract information from the provided document according to the following JSON schema:\n\n${schemaStr}\n\nReturn only valid JSON that matches the schema exactly.`,
        };
    }

    /**
     * Build the user message for a single document by reading it as UTF-8 text.
     * Simplified implementation: no file-type detection or base64 encoding.
     *
     * @param {string} docPath - Document to read.
     * @param {string} _modality - Currently unused.
     * @param {object} [_options] - Currently unused.
     * @returns {Promise<{role: 'user', content: string}>}
     */
    async createUserMessage(docPath, _modality, _options = {}) {
        const content = fs.readFileSync(docPath, 'utf-8');
        return {
            role: 'user',
            content: `Please extract data from this document:\n\n${content}`,
        };
    }

    /**
     * Build one user message containing every document, each under a
     * `Document N (<file name>):` header and separated by `---` lines.
     *
     * @param {string[]} docPaths - Documents to read as UTF-8 text.
     * @param {string} _modality - Currently unused.
     * @returns {Promise<{role: 'user', content: string}>}
     */
    async createMultiDocumentUserMessage(docPaths, _modality) {
        const contents = docPaths.map((docPath, index) => {
            const content = fs.readFileSync(docPath, 'utf-8');
            return `Document ${index + 1} (${path.basename(docPath)}):\n${content}`;
        }).join('\n\n---\n\n');
        return {
            role: 'user',
            content: `Please extract data from these documents:\n\n${contents}`,
        };
    }

    /**
     * Placeholder for the AI-provider call that annotates one document.
     *
     * @throws {Error} Always; provider integration is not implemented.
     */
    async generateAnnotation(_jsonSchema, _docPath, _model, _temperature, _modality, _reasoningEffort, _provider) {
        throw new Error('AI provider integration not implemented in this version');
    }

    /**
     * Placeholder for re-generating an annotation from a stored user message.
     *
     * @throws {Error} Always; annotation update is not implemented.
     */
    async generateAnnotationFromUserMessage(_jsonSchema, _userContent, _options) {
        throw new Error('Annotation update not implemented in this version');
    }
}
295
/**
 * Dataset resource attached to the synchronous client.
 *
 * Holds a private `BaseDatasetsMixin` and forwards every call to it with the
 * arguments unchanged; omitted arguments arrive as `undefined`, so the
 * mixin's own defaults apply.
 */
export class Datasets extends SyncAPIResource {
    constructor(...resourceArgs) {
        super(...resourceArgs);
        this.mixin = new BaseDatasetsMixin();
    }

    /** Forwards to `BaseDatasetsMixin#pprint`. */
    async pprint(datasetPath, inputTokenPrice, outputTokenPrice) {
        const { mixin } = this;
        return mixin.pprint(datasetPath, inputTokenPrice, outputTokenPrice);
    }

    /** Forwards to `BaseDatasetsMixin#save`. */
    async save(jsonSchema, documentAnnotationPairsPaths, datasetPath, options) {
        const { mixin } = this;
        return mixin.save(jsonSchema, documentAnnotationPairsPaths, datasetPath, options);
    }

    /** Forwards to `BaseDatasetsMixin#changeSchema`. */
    async changeSchema(inputDatasetPath, jsonSchema, outputDatasetPath, inplace) {
        const { mixin } = this;
        return mixin.changeSchema(inputDatasetPath, jsonSchema, outputDatasetPath, inplace);
    }

    /** Forwards to `BaseDatasetsMixin#stitchAndSave`. */
    async stitchAndSave(jsonSchema, pairsPaths, datasetPath, modality) {
        const { mixin } = this;
        return mixin.stitchAndSave(jsonSchema, pairsPaths, datasetPath, modality);
    }

    /** Forwards to `BaseDatasetsMixin#annotate`. */
    async annotate(jsonSchema, documents, datasetPath, options) {
        const { mixin } = this;
        return mixin.annotate(jsonSchema, documents, datasetPath, options);
    }

    /** Forwards to `BaseDatasetsMixin#updateAnnotations`. */
    async updateAnnotations(jsonSchema, oldDatasetPath, newDatasetPath, options) {
        const { mixin } = this;
        return mixin.updateAnnotations(jsonSchema, oldDatasetPath, newDatasetPath, options);
    }

    /** Forwards to `BaseDatasetsMixin#saveBatchAnnotateRequests`. */
    async saveBatchAnnotateRequests(jsonSchema, documents, batchRequestsPath, options) {
        const { mixin } = this;
        return mixin.saveBatchAnnotateRequests(jsonSchema, documents, batchRequestsPath, options);
    }

    /** Forwards to `BaseDatasetsMixin#buildDatasetFromBatchResults`. */
    async buildDatasetFromBatchResults(jsonSchema, batchResultsPath, datasetPath, modality) {
        const { mixin } = this;
        return mixin.buildDatasetFromBatchResults(jsonSchema, batchResultsPath, datasetPath, modality);
    }
}
325
/**
 * Dataset resource attached to the asynchronous client.
 *
 * Holds a private `BaseDatasetsMixin` and forwards every call to it with the
 * arguments unchanged; omitted arguments arrive as `undefined`, so the
 * mixin's own defaults apply.
 */
export class AsyncDatasets extends AsyncAPIResource {
    constructor(...resourceArgs) {
        super(...resourceArgs);
        this.mixin = new BaseDatasetsMixin();
    }

    /** Forwards to `BaseDatasetsMixin#pprint`. */
    async pprint(datasetPath, inputTokenPrice, outputTokenPrice) {
        const { mixin } = this;
        return mixin.pprint(datasetPath, inputTokenPrice, outputTokenPrice);
    }

    /** Forwards to `BaseDatasetsMixin#save`. */
    async save(jsonSchema, documentAnnotationPairsPaths, datasetPath, options) {
        const { mixin } = this;
        return mixin.save(jsonSchema, documentAnnotationPairsPaths, datasetPath, options);
    }

    /** Forwards to `BaseDatasetsMixin#changeSchema`. */
    async changeSchema(inputDatasetPath, jsonSchema, outputDatasetPath, inplace) {
        const { mixin } = this;
        return mixin.changeSchema(inputDatasetPath, jsonSchema, outputDatasetPath, inplace);
    }

    /** Forwards to `BaseDatasetsMixin#stitchAndSave`. */
    async stitchAndSave(jsonSchema, pairsPaths, datasetPath, modality) {
        const { mixin } = this;
        return mixin.stitchAndSave(jsonSchema, pairsPaths, datasetPath, modality);
    }

    /** Forwards to `BaseDatasetsMixin#annotate`. */
    async annotate(jsonSchema, documents, datasetPath, options) {
        const { mixin } = this;
        return mixin.annotate(jsonSchema, documents, datasetPath, options);
    }

    /** Forwards to `BaseDatasetsMixin#updateAnnotations`. */
    async updateAnnotations(jsonSchema, oldDatasetPath, newDatasetPath, options) {
        const { mixin } = this;
        return mixin.updateAnnotations(jsonSchema, oldDatasetPath, newDatasetPath, options);
    }

    /** Forwards to `BaseDatasetsMixin#saveBatchAnnotateRequests`. */
    async saveBatchAnnotateRequests(jsonSchema, documents, batchRequestsPath, options) {
        const { mixin } = this;
        return mixin.saveBatchAnnotateRequests(jsonSchema, documents, batchRequestsPath, options);
    }

    /** Forwards to `BaseDatasetsMixin#buildDatasetFromBatchResults`. */
    async buildDatasetFromBatchResults(jsonSchema, batchResultsPath, datasetPath, modality) {
        const { mixin } = this;
        return mixin.buildDatasetFromBatchResults(jsonSchema, batchResultsPath, datasetPath, modality);
    }
}
355
// Default export: the three dataset classes grouped in one object, for
// consumers that import the module as a single value.
const datasetsModule = { Datasets, AsyncDatasets, BaseDatasetsMixin };

export default datasetsModule;