@yeyuan98/opencode-bioresearcher-plugin 1.5.0-alpha.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. package/README.md +48 -36
  2. package/dist/index.js +8 -6
  3. package/dist/skills/bioresearcher-tests/README.md +90 -0
  4. package/dist/skills/bioresearcher-tests/SKILL.md +255 -0
  5. package/dist/skills/bioresearcher-tests/pyproject.toml +6 -0
  6. package/dist/skills/bioresearcher-tests/resources/json_samples/in_markdown.md.gz +0 -0
  7. package/dist/skills/bioresearcher-tests/resources/json_samples/nested_object.json.gz +0 -0
  8. package/dist/skills/bioresearcher-tests/resources/json_samples/schema_draft7.json.gz +0 -0
  9. package/dist/skills/bioresearcher-tests/resources/json_samples/simple_array.json.gz +0 -0
  10. package/dist/skills/bioresearcher-tests/resources/json_samples/simple_object.json.gz +0 -0
  11. package/dist/skills/bioresearcher-tests/resources/obo_sample.obo.gz +0 -0
  12. package/dist/skills/bioresearcher-tests/resources/pubmed_sample.xml.gz +0 -0
  13. package/dist/skills/bioresearcher-tests/resources/table_sample.xlsx.gz +0 -0
  14. package/dist/skills/bioresearcher-tests/test_cases/json_tests.md +137 -0
  15. package/dist/skills/bioresearcher-tests/test_cases/misc_tests.md +141 -0
  16. package/dist/skills/bioresearcher-tests/test_cases/parser_tests.md +80 -0
  17. package/dist/skills/bioresearcher-tests/test_cases/skill_tests.md +59 -0
  18. package/dist/skills/bioresearcher-tests/test_cases/table_tests.md +194 -0
  19. package/dist/skills/bioresearcher-tests/test_runner.py +607 -0
  20. package/dist/skills/env-jsonc-setup/SKILL.md +206 -206
  21. package/dist/skills/long-table-summary/SKILL.md +224 -153
  22. package/dist/skills/long-table-summary/combine_outputs.py +55 -9
  23. package/dist/skills/long-table-summary/generate_prompts.py +9 -0
  24. package/dist/skills/pubmed-weekly/pubmed_weekly.py +130 -29
  25. package/dist/{db-tools → tools/db}/backends/mysql/translator.js +23 -23
  26. package/dist/{db-tools → tools/db}/tools.js +34 -34
  27. package/dist/{misc-tools → tools/misc}/json-validate.js +4 -5
  28. package/dist/tools/parser/obo/index.d.ts +2 -0
  29. package/dist/tools/parser/obo/index.js +2 -0
  30. package/dist/tools/parser/obo/obo.d.ts +17 -0
  31. package/dist/tools/parser/obo/obo.js +216 -0
  32. package/dist/tools/parser/obo/types.d.ts +166 -0
  33. package/dist/tools/parser/obo/utils.d.ts +21 -0
  34. package/dist/tools/parser/obo/utils.js +411 -0
  35. package/dist/tools/parser/pubmed/types.js +1 -0
  36. package/dist/{skill-tools → tools/skill}/registry.js +1 -1
  37. package/package.json +1 -1
  38. package/dist/db-tools/executor.d.ts +0 -13
  39. package/dist/db-tools/executor.js +0 -54
  40. package/dist/db-tools/pool.d.ts +0 -8
  41. package/dist/db-tools/pool.js +0 -49
  42. package/dist/db-tools/tools/index.d.ts +0 -27
  43. package/dist/db-tools/tools/index.js +0 -191
  44. package/dist/db-tools/types.d.ts +0 -94
  45. package/dist/db-tools/types.js +0 -40
  46. package/dist/misc-tools/json-tools.d.ts +0 -33
  47. package/dist/misc-tools/json-tools.js +0 -187
  48. package/dist/skill/frontmatter.d.ts +0 -2
  49. package/dist/skill/frontmatter.js +0 -65
  50. package/dist/skill/index.d.ts +0 -3
  51. package/dist/skill/index.js +0 -2
  52. package/dist/skill/registry.d.ts +0 -11
  53. package/dist/skill/registry.js +0 -64
  54. package/dist/skill/tool.d.ts +0 -9
  55. package/dist/skill/tool.js +0 -115
  56. package/dist/skill/types.d.ts +0 -22
  57. package/dist/skill/types.js +0 -7
  58. /package/dist/{db-tools → tools/db}/backends/index.d.ts +0 -0
  59. /package/dist/{db-tools → tools/db}/backends/index.js +0 -0
  60. /package/dist/{db-tools → tools/db}/backends/mongodb/backend.d.ts +0 -0
  61. /package/dist/{db-tools → tools/db}/backends/mongodb/backend.js +0 -0
  62. /package/dist/{db-tools → tools/db}/backends/mongodb/connection.d.ts +0 -0
  63. /package/dist/{db-tools → tools/db}/backends/mongodb/connection.js +0 -0
  64. /package/dist/{db-tools → tools/db}/backends/mongodb/index.d.ts +0 -0
  65. /package/dist/{db-tools → tools/db}/backends/mongodb/index.js +0 -0
  66. /package/dist/{db-tools → tools/db}/backends/mongodb/translator.d.ts +0 -0
  67. /package/dist/{db-tools → tools/db}/backends/mongodb/translator.js +0 -0
  68. /package/dist/{db-tools → tools/db}/backends/mysql/backend.d.ts +0 -0
  69. /package/dist/{db-tools → tools/db}/backends/mysql/backend.js +0 -0
  70. /package/dist/{db-tools → tools/db}/backends/mysql/connection.d.ts +0 -0
  71. /package/dist/{db-tools → tools/db}/backends/mysql/connection.js +0 -0
  72. /package/dist/{db-tools → tools/db}/backends/mysql/index.d.ts +0 -0
  73. /package/dist/{db-tools → tools/db}/backends/mysql/index.js +0 -0
  74. /package/dist/{db-tools → tools/db}/backends/mysql/translator.d.ts +0 -0
  75. /package/dist/{db-tools → tools/db}/core/base.d.ts +0 -0
  76. /package/dist/{db-tools → tools/db}/core/base.js +0 -0
  77. /package/dist/{db-tools → tools/db}/core/config-loader.d.ts +0 -0
  78. /package/dist/{db-tools → tools/db}/core/config-loader.js +0 -0
  79. /package/dist/{db-tools → tools/db}/core/index.d.ts +0 -0
  80. /package/dist/{db-tools → tools/db}/core/index.js +0 -0
  81. /package/dist/{db-tools → tools/db}/core/jsonc-parser.d.ts +0 -0
  82. /package/dist/{db-tools → tools/db}/core/jsonc-parser.js +0 -0
  83. /package/dist/{db-tools → tools/db}/core/validator.d.ts +0 -0
  84. /package/dist/{db-tools → tools/db}/core/validator.js +0 -0
  85. /package/dist/{db-tools → tools/db}/index.d.ts +0 -0
  86. /package/dist/{db-tools → tools/db}/index.js +0 -0
  87. /package/dist/{db-tools → tools/db}/interface/backend.d.ts +0 -0
  88. /package/dist/{db-tools → tools/db}/interface/backend.js +0 -0
  89. /package/dist/{db-tools → tools/db}/interface/connection.d.ts +0 -0
  90. /package/dist/{db-tools → tools/db}/interface/connection.js +0 -0
  91. /package/dist/{db-tools → tools/db}/interface/index.d.ts +0 -0
  92. /package/dist/{db-tools → tools/db}/interface/index.js +0 -0
  93. /package/dist/{db-tools → tools/db}/interface/query.d.ts +0 -0
  94. /package/dist/{db-tools → tools/db}/interface/query.js +0 -0
  95. /package/dist/{db-tools → tools/db}/interface/schema.d.ts +0 -0
  96. /package/dist/{db-tools → tools/db}/interface/schema.js +0 -0
  97. /package/dist/{db-tools → tools/db}/tools.d.ts +0 -0
  98. /package/dist/{db-tools → tools/db}/utils.d.ts +0 -0
  99. /package/dist/{db-tools → tools/db}/utils.js +0 -0
  100. /package/dist/{misc-tools → tools/misc}/calculator.d.ts +0 -0
  101. /package/dist/{misc-tools → tools/misc}/calculator.js +0 -0
  102. /package/dist/{misc-tools → tools/misc}/index.d.ts +0 -0
  103. /package/dist/{misc-tools → tools/misc}/index.js +0 -0
  104. /package/dist/{misc-tools → tools/misc}/json-extract.d.ts +0 -0
  105. /package/dist/{misc-tools → tools/misc}/json-extract.js +0 -0
  106. /package/dist/{misc-tools → tools/misc}/json-infer.d.ts +0 -0
  107. /package/dist/{misc-tools → tools/misc}/json-infer.js +0 -0
  108. /package/dist/{misc-tools → tools/misc}/json-validate.d.ts +0 -0
  109. /package/dist/{misc-tools → tools/misc}/timer.d.ts +0 -0
  110. /package/dist/{misc-tools → tools/misc}/timer.js +0 -0
  111. /package/dist/{parser-tools/pubmed → tools/parser/obo}/types.js +0 -0
  112. /package/dist/{parser-tools → tools/parser}/pubmed/index.d.ts +0 -0
  113. /package/dist/{parser-tools → tools/parser}/pubmed/index.js +0 -0
  114. /package/dist/{parser-tools → tools/parser}/pubmed/pubmed.d.ts +0 -0
  115. /package/dist/{parser-tools → tools/parser}/pubmed/pubmed.js +0 -0
  116. /package/dist/{parser-tools → tools/parser}/pubmed/types.d.ts +0 -0
  117. /package/dist/{parser-tools → tools/parser}/pubmed/utils.d.ts +0 -0
  118. /package/dist/{parser-tools → tools/parser}/pubmed/utils.js +0 -0
  119. /package/dist/{skill-tools → tools/skill}/frontmatter.d.ts +0 -0
  120. /package/dist/{skill-tools → tools/skill}/frontmatter.js +0 -0
  121. /package/dist/{skill-tools → tools/skill}/index.d.ts +0 -0
  122. /package/dist/{skill-tools → tools/skill}/index.js +0 -0
  123. /package/dist/{skill-tools → tools/skill}/registry.d.ts +0 -0
  124. /package/dist/{skill-tools → tools/skill}/tool.d.ts +0 -0
  125. /package/dist/{skill-tools → tools/skill}/tool.js +0 -0
  126. /package/dist/{skill-tools → tools/skill}/types.d.ts +0 -0
  127. /package/dist/{skill-tools → tools/skill}/types.js +0 -0
  128. /package/dist/{table-tools → tools/table}/index.d.ts +0 -0
  129. /package/dist/{table-tools → tools/table}/index.js +0 -0
  130. /package/dist/{table-tools → tools/table}/tools.d.ts +0 -0
  131. /package/dist/{table-tools → tools/table}/tools.js +0 -0
  132. /package/dist/{table-tools → tools/table}/utils.d.ts +0 -0
  133. /package/dist/{table-tools → tools/table}/utils.js +0 -0
@@ -1,17 +1,19 @@
1
1
  ---
2
2
  name: long-table-summary
3
3
  description: Batch-process large tables using parallel subagents for summarization
4
- allowedTools:
5
- - Bash
6
- - Read
7
- - Write
8
- - Question
9
- - Task
10
- - tableListSheets
11
- - tableGetSheetPreview
12
- - tableGetHeaders
13
- - tableGetRange
14
- ---
4
+ allowedTools:
5
+ - Bash
6
+ - Read
7
+ - Write
8
+ - Question
9
+ - Task
10
+ - tableListSheets
11
+ - tableGetSheetPreview
12
+ - tableGetHeaders
13
+ - tableGetRange
14
+ - jsonValidate
15
+ - jsonInfer
16
+ ---
15
17
 
16
18
  # Long Table Summary
17
19
 
@@ -110,9 +112,70 @@ If user selects "Yes, I want to modify":
110
112
  - Update the JSON accordingly
111
113
  - Repeat the approval question
112
114
 
113
- Continue until user explicitly confirms that the instruction JSON is correct.
114
-
115
- ### Step 7: Autogenerate Topic Name
115
+ Continue until user explicitly confirms that the instruction JSON is correct.
116
+
117
+ ### Step 6.5: Generate Output JSON Schema
118
+
119
+ Generate a JSON Schema that defines the exact output structure. All fields are required.
120
+
121
+ **Default value for unavailable data:** Use `"NA"` (string) for any field where data cannot be extracted.
122
+
123
+ **Construct example output:**
124
+
125
+ 1. Start with the base structure:
126
+ ```json
127
+ {
128
+ "batch_number": 1,
129
+ "row_count": 30,
130
+ "summaries": [
131
+ {
132
+ "row_number": 2
133
+ }
134
+ ]
135
+ }
136
+ ```
137
+
138
+ 2. Add each user-specified field with an example value (use `"NA"` if the field might be empty):
139
+
140
+ For example, if user provided:
141
+ ```json
142
+ {
143
+ "species": "Species classification: Tier1/Tier2/NA",
144
+ "topic": "Main topic: Oncology/Immunology/Other"
145
+ }
146
+ ```
147
+
148
+ Construct the example output:
149
+ ```json
150
+ {
151
+ "batch_number": 1,
152
+ "row_count": 30,
153
+ "summaries": [
154
+ {
155
+ "row_number": 2,
156
+ "species": "Tier1",
157
+ "topic": "Oncology"
158
+ }
159
+ ]
160
+ }
161
+ ```
162
+
163
+ **Generate schema with strict mode:**
164
+
165
+ ```typescript
166
+ jsonInfer data='<example_output_json>' strict=true
167
+ ```
168
+
169
+ **Save the returned schema to:**
170
+
171
+ ```bash
172
+ Write file: .long-table-summary/{topic}/schema.json
173
+ Content: <schema_from_jsonInfer>
174
+ ```
175
+
176
+ This schema file will be used by all subagents to validate their outputs before writing.
177
+
178
+ ### Step 7: Autogenerate Topic Name
116
179
 
117
180
  Generate the topic name by combining:
118
181
  - Base filename (without extension)
@@ -153,61 +216,77 @@ Example for 90 rows with 30 per batch:
153
216
 
154
217
  **Note:** Row 1 is the header, data starts at row 2.
155
218
 
156
- ### Step 10: Create Subagent Prompt Template
157
-
158
- Create a template with `{placeholder}` format (single braces):
159
-
160
- ```markdown
161
- # Batch Data Summarization Task
162
-
163
- ## Input File
164
- - Full path: `{file_path}`
165
- - Sheet name: `{sheet_name}`
166
-
167
- ## Row Range
168
- - Batch number: {batch_number}
169
- - Start row: {row_start}
170
- - End row: {row_end}
171
-
172
- ## Summarization Instructions
173
-
174
- Extract the following fields from each row:
175
-
176
- {instructions_json}
177
-
178
- ## Output Format
179
-
180
- Your output must be a valid JSON file with this structure:
181
-
182
- ```json
183
- {
184
- "batch_number": {batch_number},
185
- "row_count": <number_of_rows_processed>,
186
- "summaries": [
187
- {
188
- "row_number": <row_number>,
189
- <field_1>: "<extracted_value>",
190
- <field_2>: "<extracted_value>"
191
- }
192
- ]
193
- }
194
- ```
195
-
196
- **Important:** The JSON keys for extracted values must match the field names specified in the Summarization Instructions.
197
-
198
- ## Instructions
199
-
200
- 1. Read the specified row range from the input file using the `tableGetRange` tool
201
- 2. For each row, extract the requested fields according to the instructions above
202
- 3. Map the extracted values to the JSON keys specified in the instructions
203
- 4. Generate concise summaries based on the extracted data
204
- 5. Save your output to: `{output_file}`
205
-
206
- ## Output File Path
207
- Full path: `{output_file}`
208
-
209
- **CRITICAL:** Write your final output as a markdown file (.md) containing ONLY the JSON object (no additional text or explanation).
210
- ```
219
+ ### Step 10: Create Subagent Prompt Template
220
+
221
+ Create a template with `{placeholder}` format (single braces):
222
+
223
+ ```markdown
224
+ # Batch Data Summarization Task
225
+
226
+ ## Input File
227
+ - Path: `{file_path}`
228
+ - Sheet: `{sheet_name}`
229
+
230
+ ## Row Range
231
+ - Batch: {batch_number}
232
+ - Rows: {row_start} to {row_end}
233
+
234
+ ## Summarization Instructions
235
+
236
+ For each row, extract these fields:
237
+
238
+ {instructions_json}
239
+
240
+ **Default for unavailable data:** If a field cannot be extracted, use `"NA"` as the value.
241
+
242
+ ## Output Structure
243
+
244
+ ```json
245
+ {
246
+ "batch_number": {batch_number},
247
+ "row_count": <number_of_rows_in_this_batch>,
248
+ "summaries": [
249
+ {
250
+ "row_number": <row_number>,
251
+ "<field_1>": "<value_or_NA>",
252
+ "<field_2>": "<value_or_NA>"
253
+ }
254
+ ]
255
+ }
256
+ ```
257
+
258
+ ## Output Schema
259
+
260
+ Your output must conform to this schema: `{schema_path}`
261
+
262
+ All fields are required. Use `"NA"` for unavailable values.
263
+
264
+ ## Mandatory Workflow
265
+
266
+ **Step 1:** Read rows using `tableGetRange`:
267
+ ```typescript
268
+ tableGetRange file_path="{file_path}" sheet_name="{sheet_name}" range="A{row_start}:Z{row_end}"
269
+ ```
270
+
271
+ **Step 2:** Build JSON in memory with all required fields
272
+
273
+ **Step 3:** Validate BEFORE writing:
274
+ ```typescript
275
+ jsonValidate data='<your_complete_json>' schema="{schema_path}"
276
+ ```
277
+
278
+ **Step 4:** Check result:
279
+ - If `valid: true` → Go to Step 5
280
+ - If `valid: false` → Fix errors listed in `errors` array, return to Step 3
281
+
282
+ **Step 5:** Write validated JSON to `{output_file}`
283
+
284
+ Output file should contain ONLY the JSON object (no markdown, no extra text).
285
+
286
+ ## Output Path
287
+ `{output_file}`
288
+ ```
289
+ ```
211
290
 
212
291
  ### Step 11: Create Directory Structure
213
292
 
@@ -226,31 +305,33 @@ Use `generate_prompts.py`:
226
305
 
227
306
  **Before Step 13 and Step 17:** Extract the full path to the skill directory from the `<skill_files>` section in the skill tool output. Use this path as `<skill_path>` in the commands below.
228
307
 
229
- **Unix-like shells:**
230
- ```bash
231
- uv run python <skill_path>/generate_prompts.py \
232
- --template .long-table-summary/{topic}/subagent_template.md \
233
- --output-dir .long-table-summary/{topic}/prompts \
234
- --num-batches {num_batches} \
235
- --sheet-name "{sheet_name}" \
236
- --file-path "{input_file}" \
237
- --start-row 2 \
238
- --batch-size {batch_size} \
239
- --instructions '{instructions_json}'
240
- ```
241
-
242
- **For Windows cmd.exe:**
243
- ```bash
244
- uv.exe run python <skill_path>\generate_prompts.py ^
245
- --template .long-table-summary\{topic}\subagent_template.md ^
246
- --output-dir .long-table-summary\{topic}\prompts ^
247
- --num-batches {num_batches} ^
248
- --sheet-name "{sheet_name}" ^
249
- --file-path "{input_file}" ^
250
- --start-row 2 ^
251
- --batch-size {batch_size} ^
252
- --instructions "{instructions_json}"
253
- ```
308
+ **Unix-like shells:**
309
+ ```bash
310
+ uv run python <skill_path>/generate_prompts.py \
311
+ --template .long-table-summary/{topic}/subagent_template.md \
312
+ --output-dir .long-table-summary/{topic}/prompts \
313
+ --num-batches {num_batches} \
314
+ --sheet-name "{sheet_name}" \
315
+ --file-path "{input_file}" \
316
+ --start-row 2 \
317
+ --batch-size {batch_size} \
318
+ --instructions '{instructions_json}' \
319
+ --schema-path ".long-table-summary/{topic}/schema.json"
320
+ ```
321
+
322
+ **For Windows cmd.exe:**
323
+ ```bash
324
+ uv.exe run python <skill_path>\generate_prompts.py ^
325
+ --template .long-table-summary\{topic}\subagent_template.md ^
326
+ --output-dir .long-table-summary\{topic}\prompts ^
327
+ --num-batches {num_batches} ^
328
+ --sheet-name "{sheet_name}" ^
329
+ --file-path "{input_file}" ^
330
+ --start-row 2 ^
331
+ --batch-size {batch_size} ^
332
+ --instructions "{instructions_json}" ^
333
+ --schema-path ".long-table-summary\{topic}\schema.json"
334
+ ```
254
335
 
255
336
  **Note:** The `{instructions_json}` is the user-confirmed JSON from Step 6.
256
337
 
@@ -293,43 +374,31 @@ For example:
293
374
 
294
375
  Do NOT inspect individual subagent outputs midway.
295
376
 
296
- ### Step 16: Retry Failed Batches
297
-
298
- After all batches are done, check for missing outputs:
299
-
300
- ```bash
301
- ls .long-table-summary/{topic}/outputs/
302
- ```
303
-
304
- **Missing files** = failed batches. Collect the batch numbers of all missing files.
305
-
306
- If there are failed batches:
307
-
308
- 1. Ask user using the `question` tool:
309
- - "Continue with failed batches or stop?
310
-
311
- Failed batches: batch003, batch007 (2 failures)
312
-
313
- Continue - Will retry each failed batch up to 3 times
314
- Stop - Keep current outputs and proceed to final report"
315
-
316
- (Replace the bracketed list with the actual missing batch numbers)
317
-
318
- 2. **Options:**
319
- - "Continue with failed batches"
320
- - "Stop and keep current outputs"
321
-
322
- 3. **If user selects "Continue":**
323
- - For each failed batch:
324
- a. Wait 2 seconds
325
- b. Retry with the same `subagent_type="general"`
326
- c. Up to 3 retry attempts
327
- - After retries, check again for remaining failures
328
- - If batches are still failing, repeat the question with the updated failed list
329
-
330
- 4. **If user selects "Stop":**
331
- - Do not retry any more batches
332
- - Proceed to Step 17 with whatever outputs exist
377
+ ### Step 16: Check for Missing Outputs
378
+
379
+ After all batches are done, check for missing outputs:
380
+
381
+ ```bash
382
+ ls .long-table-summary/{topic}/outputs/
383
+ ```
384
+
385
+ Missing files indicate subagent failure. If any are missing:
386
+
387
+ 1. Ask user using the `question` tool:
388
+ - "{number} batches failed. Retry failed batches or proceed with available outputs?"
389
+
390
+ 2. **Options:**
391
+ - "Retry failed batches"
392
+ - "Proceed with available outputs"
393
+
394
+ 3. **If user selects "Retry":**
395
+ - Re-launch subagent with same prompt file for each failed batch
396
+
397
+ 4. **If user selects "Proceed":**
398
+ - Continue to Step 17 with available outputs
399
+
400
+ Note: Since subagents validate their outputs before writing, existing files should contain valid JSON.
401
+
333
402
 
334
403
  ### Step 17: Combine All JSON Outputs
335
404
 
@@ -375,28 +444,30 @@ Provide user with:
375
444
 
376
445
  ## Python Scripts
377
446
 
378
- ### Script 1: `generate_prompts.py`
379
-
380
- **Arguments:**
381
- - `--template`: Path to subagent_template.md
382
- - `--output-dir`: Directory for generated prompts
383
- - `--num-batches`: Total number of batches
384
- - `--sheet-name`: Sheet name
385
- - `--file-path`: Full path to the input table file
386
- - `--start-row`: Starting data row (default: 2)
387
- - `--batch-size`: Rows per batch (default: 30)
388
- - `--instructions`: User-confirmed JSON with summarization fields
389
- - `--dry-run`: Validate without creating files (optional)
390
- - `--verbose`: Enable verbose output for debugging (optional)
391
-
392
- **Placeholders to replace:**
393
- - `{file_path}` → Absolute input file path
394
- - `{sheet_name}` → Sheet name
395
- - `{batch_number}` → Batch number (001, 002, etc.)
396
- - `{row_start}` → Start row
397
- - `{row_end}` → End row
398
- - `{output_file}` → Output file path
399
- - `{instructions_json}` → User's JSON instruction (properly escaped for markdown code block)
447
+ ### Script 1: `generate_prompts.py`
448
+
449
+ **Arguments:**
450
+ - `--template`: Path to subagent_template.md
451
+ - `--output-dir`: Directory for generated prompts
452
+ - `--num-batches`: Total number of batches
453
+ - `--sheet-name`: Sheet name
454
+ - `--file-path`: Full path to the input table file
455
+ - `--start-row`: Starting data row (default: 2)
456
+ - `--batch-size`: Rows per batch (default: 30)
457
+ - `--instructions`: User-confirmed JSON with summarization fields
458
+ - `--schema-path`: Path to output JSON Schema file (required)
459
+ - `--dry-run`: Validate without creating files (optional)
460
+ - `--verbose`: Enable verbose output for debugging (optional)
461
+
462
+ **Placeholders to replace:**
463
+ - `{file_path}` → Absolute input file path
464
+ - `{sheet_name}` → Sheet name
465
+ - `{batch_number}` → Batch number (001, 002, etc.)
466
+ - `{row_start}` → Start row
467
+ - `{row_end}` → End row
468
+ - `{output_file}` → Output file path
469
+ - `{instructions_json}` → User's JSON instruction (properly escaped for markdown code block)
470
+ - `{schema_path}` → Path to output JSON Schema file
400
471
 
401
472
  ### Script 2: `combine_outputs.py`
402
473
 
@@ -37,20 +37,17 @@ def read_json_outputs(input_dir: str, verbose: bool = False) -> Dict[str, Any]:
37
37
  with open(batch_file, "r", encoding="utf-8") as f:
38
38
  content = f.read().strip()
39
39
 
40
- # Find JSON in markdown (typically the entire content)
41
- json_start = content.find("{")
42
- json_end = content.rfind("}") + 1
43
-
44
- if json_start == -1 or json_end == 0:
40
+ # Extract JSON using brace matching (string-aware, handles nested structures)
41
+ extracted = extract_json_from_content(content)
42
+ if extracted is None:
45
43
  if verbose:
46
- print(f"Warning: No JSON found in {batch_file.name}")
44
+ print(f"Warning: No valid JSON found in {batch_file.name}")
47
45
  continue
48
46
 
49
- json_str = content[json_start:json_end]
50
- data = json.loads(json_str)
51
- all_summaries.append(data)
47
+ all_summaries.append(extracted)
52
48
 
53
49
  if verbose:
50
+ data = extracted
54
51
  print(
55
52
  f"Parsed: {batch_file.name} - {len(data.get('summaries', []))} summaries"
56
53
  )
@@ -67,6 +64,55 @@ def read_json_outputs(input_dir: str, verbose: bool = False) -> Dict[str, Any]:
67
64
  return {"success": True, "summaries": all_summaries}
68
65
 
69
66
 
67
+ def extract_json_from_content(content: str) -> dict | None:
68
+ """Extract JSON from content using string-aware brace matching.
69
+
70
+ Args:
71
+ content: File content string
72
+
73
+ Returns:
74
+ Parsed JSON dict or None if not found
75
+ """
76
+ # Try direct parse first (file contains only JSON)
77
+ try:
78
+ return json.loads(content)
79
+ except json.JSONDecodeError:
80
+ pass
81
+
82
+ # Find JSON object boundaries with brace matching
83
+ start = content.find("{")
84
+ if start == -1:
85
+ return None
86
+
87
+ depth = 0
88
+ in_string = False
89
+ escape = False
90
+
91
+ for i in range(start, len(content)):
92
+ char = content[i]
93
+ if escape:
94
+ escape = False
95
+ continue
96
+ if char == "\\":
97
+ escape = True
98
+ continue
99
+ if char == '"':
100
+ in_string = not in_string
101
+ continue
102
+ if not in_string:
103
+ if char == "{":
104
+ depth += 1
105
+ elif char == "}":
106
+ depth -= 1
107
+ if depth == 0:
108
+ try:
109
+ return json.loads(content[start : i + 1])
110
+ except json.JSONDecodeError:
111
+ return None
112
+
113
+ return None
114
+
115
+
70
116
  def merge_summaries(
71
117
  summaries: List[Dict[str, Any]],
72
118
  deduplicate: bool = False,
@@ -17,6 +17,7 @@ def generate_prompts(
17
17
  batch_size,
18
18
  file_path,
19
19
  instructions,
20
+ schema_path,
20
21
  dry_run=False,
21
22
  verbose=False,
22
23
  ):
@@ -31,6 +32,7 @@ def generate_prompts(
31
32
  batch_size: Rows per batch
32
33
  file_path: Full path to input table file
33
34
  instructions: User-provided summarization instructions (JSON string)
35
+ schema_path: Path to output JSON Schema file
34
36
  dry_run: Validate without creating files
35
37
  verbose: Enable verbose output
36
38
  """
@@ -83,6 +85,7 @@ def generate_prompts(
83
85
  content = content.replace("{row_end}", str(row_end))
84
86
  content = content.replace("{output_file}", output_file)
85
87
  content = content.replace("{instructions_json}", instructions_escaped)
88
+ content = content.replace("{schema_path}", schema_path)
86
89
 
87
90
  # Dry run mode - skip actual file writes
88
91
  if dry_run:
@@ -147,6 +150,11 @@ def main():
147
150
  required=True,
148
151
  help="User-provided summarization instructions (JSON string)",
149
152
  )
153
+ parser.add_argument(
154
+ "--schema-path",
155
+ required=True,
156
+ help="Path to output JSON Schema file (relative or absolute)",
157
+ )
150
158
  parser.add_argument(
151
159
  "--dry-run",
152
160
  action="store_true",
@@ -200,6 +208,7 @@ def main():
200
208
  batch_size=args.batch_size,
201
209
  file_path=args.file_path,
202
210
  instructions=args.instructions,
211
+ schema_path=args.schema_path,
203
212
  dry_run=args.dry_run,
204
213
  verbose=args.verbose,
205
214
  )