@promptbook/markitdown 0.92.0-21 → 0.92.0-23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. package/esm/index.es.js +136 -77
  2. package/esm/index.es.js.map +1 -1
  3. package/esm/typings/src/_packages/core.index.d.ts +6 -0
  4. package/esm/typings/src/_packages/types.index.d.ts +2 -0
  5. package/esm/typings/src/commands/FOREACH/ForeachJson.d.ts +6 -6
  6. package/esm/typings/src/config.d.ts +29 -11
  7. package/esm/typings/src/execution/createPipelineExecutor/10-executePipeline.d.ts +12 -9
  8. package/esm/typings/src/execution/createPipelineExecutor/20-executeTask.d.ts +11 -8
  9. package/esm/typings/src/execution/createPipelineExecutor/30-executeFormatSubvalues.d.ts +15 -3
  10. package/esm/typings/src/execution/createPipelineExecutor/getReservedParametersForTask.d.ts +10 -8
  11. package/esm/typings/src/formats/_common/FormatParser.d.ts +5 -3
  12. package/esm/typings/src/formats/_common/FormatSubvalueParser.d.ts +40 -5
  13. package/esm/typings/src/formats/csv/utils/isValidCsvString.d.ts +1 -1
  14. package/esm/typings/src/formats/json/utils/isValidJsonString.d.ts +1 -1
  15. package/esm/typings/src/formats/xml/utils/isValidXmlString.d.ts +1 -1
  16. package/esm/typings/src/llm-providers/_common/register/LlmToolsOptions.d.ts +4 -1
  17. package/esm/typings/src/scrapers/_common/register/$scrapersMetadataRegister.d.ts +3 -3
  18. package/esm/typings/src/types/typeAliases.d.ts +9 -7
  19. package/esm/typings/src/utils/$Register.d.ts +8 -7
  20. package/esm/typings/src/utils/parameters/mapAvailableToExpectedParameters.d.ts +7 -7
  21. package/esm/typings/src/utils/serialization/clonePipeline.d.ts +4 -3
  22. package/esm/typings/src/utils/serialization/deepClone.d.ts +5 -1
  23. package/esm/typings/src/utils/validators/javascriptName/isValidJavascriptName.d.ts +3 -3
  24. package/esm/typings/src/utils/validators/parameterName/validateParameterName.d.ts +5 -4
  25. package/package.json +2 -2
  26. package/umd/index.umd.js +136 -77
  27. package/umd/index.umd.js.map +1 -1
package/esm/index.es.js CHANGED
@@ -26,7 +26,7 @@ const BOOK_LANGUAGE_VERSION = '1.0.0';
26
26
  * @generated
27
27
  * @see https://github.com/webgptorg/promptbook
28
28
  */
29
- const PROMPTBOOK_ENGINE_VERSION = '0.92.0-21';
29
+ const PROMPTBOOK_ENGINE_VERSION = '0.92.0-23';
30
30
  /**
31
31
  * TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
32
32
  * Note: [💞] Ignore a discrepancy between file name and entity name
@@ -102,6 +102,12 @@ const DEFAULT_BOOK_TITLE = `✨ Untitled Book`;
102
102
  * @public exported from `@promptbook/core`
103
103
  */
104
104
  const DEFAULT_MAX_FILE_SIZE = 100 * 1024 * 1024; // 100MB
105
+ /**
106
+ * @@@
107
+ *
108
+ * @public exported from `@promptbook/core`
109
+ */
110
+ const BIG_DATASET_TRESHOLD = 50;
105
111
  // <- TODO: [🧠] Better system for generator warnings - not always "code" and "by `@promptbook/cli`"
106
112
  /**
107
113
  * The maximum number of iterations for a loops
@@ -181,7 +187,7 @@ const DEFAULT_DOWNLOAD_CACHE_DIRNAME = './.promptbook/download-cache';
181
187
  const DEFAULT_SCRAPE_CACHE_DIRNAME = './.promptbook/scrape-cache';
182
188
  // <- TODO: [🧜‍♂️]
183
189
  /**
184
- * @@@
190
+ * Default settings for parsing and generating CSV files in Promptbook.
185
191
  *
186
192
  * @public exported from `@promptbook/core`
187
193
  */
@@ -192,19 +198,19 @@ const DEFAULT_CSV_SETTINGS = Object.freeze({
192
198
  skipEmptyLines: true,
193
199
  });
194
200
  /**
195
- * @@@
201
+ * Controls whether verbose logging is enabled by default throughout the application.
196
202
  *
197
203
  * @public exported from `@promptbook/core`
198
204
  */
199
205
  let DEFAULT_IS_VERBOSE = false;
200
206
  /**
201
- * @@@
207
+ * Controls whether auto-installation of dependencies is enabled by default.
202
208
  *
203
209
  * @public exported from `@promptbook/core`
204
210
  */
205
211
  const DEFAULT_IS_AUTO_INSTALLED = false;
206
212
  /**
207
- * @@@
213
+ * Indicates whether pipeline logic validation is enabled. When true, the pipeline logic is checked for consistency.
208
214
  *
209
215
  * @private within the repository
210
216
  */
@@ -932,7 +938,7 @@ function assertsError(whatWasThrown) {
932
938
  * Function isValidJsonString will tell you if the string is valid JSON or not
933
939
  *
934
940
  * @param value The string to check
935
- * @returns True if the string is a valid JSON string, false otherwise
941
+ * @returns `true` if the string is a valid JSON string, false otherwise
936
942
  *
937
943
  * @public exported from `@promptbook/utils`
938
944
  */
@@ -1343,8 +1349,12 @@ function checkSerializableAsJson(options) {
1343
1349
  */
1344
1350
 
1345
1351
  /**
1346
- * @@@
1352
+ * Creates a deep clone of the given object
1353
+ *
1354
+ * Note: This method only works for objects that are fully serializable to JSON and do not contain functions, Dates, or special types.
1347
1355
  *
1356
+ * @param objectValue The object to clone.
1357
+ * @returns A deep, writable clone of the input object.
1348
1358
  * @public exported from `@promptbook/utils`
1349
1359
  */
1350
1360
  function deepClone(objectValue) {
@@ -3147,11 +3157,11 @@ function normalizeTo_snake_case(text) {
3147
3157
  }
3148
3158
 
3149
3159
  /**
3150
- * Register is @@@
3160
+ * Global registry for storing and managing registered entities of a given type.
3151
3161
  *
3152
3162
  * Note: `$` is used to indicate that this function is not a pure function - it accesses and adds variables in global scope.
3153
3163
  *
3154
- * @private internal utility, exported are only signleton instances of this class
3164
+ * @private internal utility, exported are only singleton instances of this class
3155
3165
  */
3156
3166
  class $Register {
3157
3167
  constructor(registerName) {
@@ -3195,10 +3205,10 @@ class $Register {
3195
3205
  }
3196
3206
 
3197
3207
  /**
3198
- * @@@
3208
+ * Global registry for storing metadata about all available scrapers and converters.
3199
3209
  *
3200
- * Note: `$` is used to indicate that this interacts with the global scope
3201
- * @singleton Only one instance of each register is created per build, but thare can be more @@@
3210
+ * Note: `$` is used to indicate that this interacts with the global scope.
3211
+ * @singleton Only one instance of each register is created per build, but there can be more in different contexts (e.g., tests).
3202
3212
  * @public exported from `@promptbook/core`
3203
3213
  */
3204
3214
  const $scrapersMetadataRegister = new $Register('scrapers_metadata');
@@ -4099,7 +4109,7 @@ function csvParse(value /* <- TODO: string_csv */, settings, schema /* <- TODO:
4099
4109
  * Function to check if a string is valid CSV
4100
4110
  *
4101
4111
  * @param value The string to check
4102
- * @returns True if the string is a valid CSV string, false otherwise
4112
+ * @returns `true` if the string is a valid CSV string, false otherwise
4103
4113
  *
4104
4114
  * @public exported from `@promptbook/utils`
4105
4115
  */
@@ -4138,7 +4148,8 @@ const CsvFormatParser = {
4138
4148
  subvalueParsers: [
4139
4149
  {
4140
4150
  subvalueName: 'ROW',
4141
- async mapValues(value, outputParameterName, settings, mapCallback) {
4151
+ async mapValues(options) {
4152
+ const { value, outputParameterName, settings, mapCallback, onProgress } = options;
4142
4153
  const csv = csvParse(value, settings);
4143
4154
  if (csv.errors.length !== 0) {
4144
4155
  throw new CsvFormatError(spaceTrim((block) => `
@@ -4154,21 +4165,30 @@ const CsvFormatParser = {
4154
4165
  ${block(value)}
4155
4166
  `));
4156
4167
  }
4157
- const mappedData = await Promise.all(csv.data.map(async (row, index) => {
4168
+ const mappedData = [];
4169
+ const length = csv.data.length;
4170
+ for (let index = 0; index < length; index++) {
4171
+ const row = csv.data[index];
4158
4172
  if (row[outputParameterName]) {
4159
4173
  throw new CsvFormatError(`Can not overwrite existing column "${outputParameterName}" in CSV row`);
4160
4174
  }
4161
- return {
4175
+ const mappedRow = {
4162
4176
  ...row,
4163
- [outputParameterName]: await mapCallback(row, index),
4177
+ [outputParameterName]: await mapCallback(row, index, length),
4164
4178
  };
4165
- }));
4179
+ mappedData.push(mappedRow);
4180
+ if (onProgress) {
4181
+ // Note: Report the CSV with all rows mapped so far
4182
+ await onProgress(unparse(mappedData, { ...settings, ...MANDATORY_CSV_SETTINGS }));
4183
+ }
4184
+ }
4166
4185
  return unparse(mappedData, { ...settings, ...MANDATORY_CSV_SETTINGS });
4167
4186
  },
4168
4187
  },
4169
4188
  {
4170
4189
  subvalueName: 'CELL',
4171
- async mapValues(value, outputParameterName, settings, mapCallback) {
4190
+ async mapValues(options) {
4191
+ const { value, settings, mapCallback, onProgress } = options;
4172
4192
  const csv = csvParse(value, settings);
4173
4193
  if (csv.errors.length !== 0) {
4174
4194
  throw new CsvFormatError(spaceTrim((block) => `
@@ -4185,9 +4205,9 @@ const CsvFormatParser = {
4185
4205
  `));
4186
4206
  }
4187
4207
  const mappedData = await Promise.all(csv.data.map(async (row, rowIndex) => {
4188
- return /* not await */ Promise.all(Object.entries(row).map(async ([key, value], columnIndex) => {
4208
+ return /* not await */ Promise.all(Object.entries(row).map(async ([key, value], columnIndex, array) => {
4189
4209
  const index = rowIndex * Object.keys(row).length + columnIndex;
4190
- return /* not await */ mapCallback({ [key]: value }, index);
4210
+ return /* not await */ mapCallback({ [key]: value }, index, array.length);
4191
4211
  }));
4192
4212
  }));
4193
4213
  return unparse(mappedData, { ...settings, ...MANDATORY_CSV_SETTINGS });
@@ -4255,14 +4275,15 @@ const TextFormatParser = {
4255
4275
  subvalueParsers: [
4256
4276
  {
4257
4277
  subvalueName: 'LINE',
4258
- async mapValues(value, outputParameterName, settings, mapCallback) {
4278
+ async mapValues(options) {
4279
+ const { value, mapCallback, onProgress } = options;
4259
4280
  const lines = value.split('\n');
4260
- const mappedLines = await Promise.all(lines.map((lineContent, lineNumber) =>
4281
+ const mappedLines = await Promise.all(lines.map((lineContent, lineNumber, array) =>
4261
4282
  // TODO: [🧠] Maybe option to skip empty line
4262
4283
  /* not await */ mapCallback({
4263
4284
  lineContent,
4264
4285
  // TODO: [🧠] Maybe also put here `lineNumber`
4265
- }, lineNumber)));
4286
+ }, lineNumber, array.length)));
4266
4287
  return mappedLines.join('\n');
4267
4288
  },
4268
4289
  },
@@ -4283,7 +4304,7 @@ const TextFormatParser = {
4283
4304
  * Function to check if a string is valid XML
4284
4305
  *
4285
4306
  * @param value
4286
- * @returns True if the string is a valid XML string, false otherwise
4307
+ * @returns `true` if the string is a valid XML string, false otherwise
4287
4308
  *
4288
4309
  * @public exported from `@promptbook/utils`
4289
4310
  */
@@ -4345,13 +4366,13 @@ const FORMAT_DEFINITIONS = [JsonFormatParser, XmlFormatParser, TextFormatParser,
4345
4366
  */
4346
4367
 
4347
4368
  /**
4348
- * Maps available parameters to expected parameters
4369
+ * Maps available parameters to expected parameters for a pipeline task.
4349
4370
  *
4350
4371
  * The strategy is:
4351
- * 1) @@@
4352
- * 2) @@@
4372
+ * 1) First, match parameters by name where both available and expected.
4373
+ * 2) Then, if there are unmatched expected and available parameters, map them by order.
4353
4374
  *
4354
- * @throws {PipelineExecutionError} @@@
4375
+ * @throws {PipelineExecutionError} If the number of unmatched expected and available parameters does not match, or mapping is ambiguous.
4355
4376
  * @private within the repository used in `createPipelineExecutor`
4356
4377
  */
4357
4378
  function mapAvailableToExpectedParameters(options) {
@@ -5071,12 +5092,16 @@ async function executeAttempts(options) {
5071
5092
  */
5072
5093
 
5073
5094
  /**
5074
- * @@@
5095
+ * Executes a pipeline task that requires mapping or iterating over subvalues of a parameter (such as rows in a CSV).
5096
+ * Handles format and subformat resolution, error handling, and progress reporting.
5097
+ *
5098
+ * @param options - Options for execution, including task details and progress callback.
5099
+ * @returns The result of the subvalue mapping or execution attempts.
5075
5100
  *
5076
5101
  * @private internal utility of `createPipelineExecutor`
5077
5102
  */
5078
5103
  async function executeFormatSubvalues(options) {
5079
- const { task, jokerParameterNames, parameters, priority, csvSettings, pipelineIdentification } = options;
5104
+ const { task, jokerParameterNames, parameters, priority, csvSettings, onProgress, pipelineIdentification } = options;
5080
5105
  if (task.foreach === undefined) {
5081
5106
  return /* not await */ executeAttempts(options);
5082
5107
  }
@@ -5130,46 +5155,74 @@ async function executeFormatSubvalues(options) {
5130
5155
  formatSettings = csvSettings;
5131
5156
  // <- TODO: [🤹‍♂️] More universal, make simmilar pattern for other formats for example \n vs \r\n in text
5132
5157
  }
5133
- const resultString = await subvalueParser.mapValues(parameterValue, task.foreach.outputSubparameterName, formatSettings, async (subparameters, index) => {
5134
- let mappedParameters;
5135
- // TODO: [🤹‍♂️][🪂] Limit to N concurrent executions
5136
- // TODO: When done [🐚] Report progress also for each subvalue here
5137
- try {
5138
- mappedParameters = mapAvailableToExpectedParameters({
5139
- expectedParameters: Object.fromEntries(task.foreach.inputSubparameterNames.map((subparameterName) => [subparameterName, null])),
5140
- availableParameters: subparameters,
5141
- });
5142
- }
5143
- catch (error) {
5144
- if (!(error instanceof PipelineExecutionError)) {
5145
- throw error;
5158
+ const resultString = await subvalueParser.mapValues({
5159
+ value: parameterValue,
5160
+ outputParameterName: task.foreach.outputSubparameterName,
5161
+ settings: formatSettings,
5162
+ onProgress(partialResultString) {
5163
+ return onProgress(Object.freeze({
5164
+ [task.resultingParameterName]: partialResultString,
5165
+ }));
5166
+ },
5167
+ async mapCallback(subparameters, index, length) {
5168
+ let mappedParameters;
5169
+ try {
5170
+ mappedParameters = mapAvailableToExpectedParameters({
5171
+ expectedParameters: Object.fromEntries(task.foreach.inputSubparameterNames.map((subparameterName) => [subparameterName, null])),
5172
+ availableParameters: subparameters,
5173
+ });
5146
5174
  }
5147
- throw new PipelineExecutionError(spaceTrim((block) => `
5148
- ${error.message}
5175
+ catch (error) {
5176
+ if (!(error instanceof PipelineExecutionError)) {
5177
+ throw error;
5178
+ }
5179
+ const highLevelError = new PipelineExecutionError(spaceTrim((block) => `
5180
+ ${error.message}
5149
5181
 
5150
- This is error in FOREACH command
5151
- You have probbably passed wrong data to pipeline or wrong data was generated which are processed by FOREACH command
5182
+ This is error in FOREACH command when mapping data
5183
+ You have probbably passed wrong data to pipeline or wrong data was generated which are processed by FOREACH command
5152
5184
 
5153
- ${block(pipelineIdentification)}
5154
- Subparameter index: ${index}
5155
- `));
5156
- }
5157
- const allSubparameters = {
5158
- ...parameters,
5159
- ...mappedParameters,
5160
- };
5161
- // Note: [👨‍👨‍👧] Now we can freeze `subparameters` because we are sure that all and only used parameters are defined and are not going to be changed
5162
- Object.freeze(allSubparameters);
5163
- const subresultString = await executeAttempts({
5164
- ...options,
5165
- priority: priority + index,
5166
- parameters: allSubparameters,
5167
- pipelineIdentification: spaceTrim((block) => `
5168
- ${block(pipelineIdentification)}
5169
- Subparameter index: ${index}
5170
- `),
5171
- });
5172
- return subresultString;
5185
+ ${block(pipelineIdentification)}
5186
+ Subparameter index: ${index}
5187
+ `));
5188
+ if (length > BIG_DATASET_TRESHOLD) {
5189
+ console.error(highLevelError);
5190
+ return '~';
5191
+ }
5192
+ throw highLevelError;
5193
+ }
5194
+ const allSubparameters = {
5195
+ ...parameters,
5196
+ ...mappedParameters,
5197
+ };
5198
+ Object.freeze(allSubparameters);
5199
+ try {
5200
+ const subresultString = await executeAttempts({
5201
+ ...options,
5202
+ priority: priority + index,
5203
+ parameters: allSubparameters,
5204
+ pipelineIdentification: spaceTrim((block) => `
5205
+ ${block(pipelineIdentification)}
5206
+ Subparameter index: ${index}
5207
+ `),
5208
+ });
5209
+ return subresultString;
5210
+ }
5211
+ catch (error) {
5212
+ if (length > BIG_DATASET_TRESHOLD) {
5213
+ console.error(spaceTrim((block) => `
5214
+ Error in FOREACH command:
5215
+
5216
+ ${block(pipelineIdentification)}
5217
+
5218
+ ${block(pipelineIdentification)}
5219
+ Subparameter index: ${index}
5220
+ `));
5221
+ return '~';
5222
+ }
5223
+ throw error;
5224
+ }
5225
+ },
5173
5226
  });
5174
5227
  return resultString;
5175
5228
  }
@@ -5303,7 +5356,11 @@ async function getKnowledgeForTask(options) {
5303
5356
  */
5304
5357
 
5305
5358
  /**
5306
- * @@@
5359
+ * Retrieves all reserved parameters for a given pipeline task, including context, knowledge, examples, and metadata.
5360
+ * Ensures all reserved parameters are defined and throws if any are missing.
5361
+ *
5362
+ * @param options - Options including tools, pipeline, task, and context.
5363
+ * @returns An object containing all reserved parameters for the task.
5307
5364
  *
5308
5365
  * @private internal utility of `createPipelineExecutor`
5309
5366
  */
@@ -5336,18 +5393,16 @@ async function getReservedParametersForTask(options) {
5336
5393
  }
5337
5394
 
5338
5395
  /**
5339
- * @@@
5396
+ * Executes a single task within a pipeline, handling parameter validation, error checking, and progress reporting.
5397
+ *
5398
+ * @param options - Options for execution, including the task, pipeline, parameters, and callbacks.
5399
+ * @returns The output parameters produced by the task.
5340
5400
  *
5341
5401
  * @private internal utility of `createPipelineExecutor`
5342
5402
  */
5343
5403
  async function executeTask(options) {
5344
5404
  const { currentTask, preparedPipeline, parametersToPass, tools, onProgress, $executionReport, pipelineIdentification, maxExecutionAttempts, maxParallelCount, csvSettings, isVerbose, rootDirname, cacheDirname, intermediateFilesStrategy, isAutoInstalled, isNotPreparedWarningSupressed, } = options;
5345
5405
  const priority = preparedPipeline.tasks.length - preparedPipeline.tasks.indexOf(currentTask);
5346
- await onProgress({
5347
- outputParameters: {
5348
- [currentTask.resultingParameterName]: '', // <- TODO: [🧠] What is the best value here?
5349
- },
5350
- });
5351
5406
  // Note: Check consistency of used and dependent parameters which was also done in `validatePipeline`, but it’s good to doublecheck
5352
5407
  const usedParameterNames = extractParameterNamesFromTask(currentTask);
5353
5408
  const dependentParameterNames = new Set(currentTask.dependentParameterNames);
@@ -5422,6 +5477,7 @@ async function executeTask(options) {
5422
5477
  preparedPipeline,
5423
5478
  tools,
5424
5479
  $executionReport,
5480
+ onProgress,
5425
5481
  pipelineIdentification,
5426
5482
  maxExecutionAttempts,
5427
5483
  maxParallelCount,
@@ -5474,9 +5530,12 @@ function filterJustOutputParameters(options) {
5474
5530
  }
5475
5531
 
5476
5532
  /**
5477
- * @@@
5533
+ * Executes an entire pipeline, resolving tasks in dependency order, handling errors, and reporting progress.
5534
+ *
5535
+ * Note: This is not a `PipelineExecutor` (which is bound to a single pipeline), but a utility function used by `createPipelineExecutor` to create a `PipelineExecutor`.
5478
5536
  *
5479
- * Note: This is not a `PipelineExecutor` (which is binded with one exact pipeline), but a utility function of `createPipelineExecutor` which creates `PipelineExecutor`
5537
+ * @param options - Options for execution, including input parameters, pipeline, and callbacks.
5538
+ * @returns The result of the pipeline execution, including output parameters, errors, and usage statistics.
5480
5539
  *
5481
5540
  * @private internal utility of `createPipelineExecutor`
5482
5541
  */