@exulu/backend 1.48.2 → 1.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. package/dist/index.cjs +351 -42
  2. package/dist/index.d.cts +96 -1
  3. package/dist/index.d.ts +96 -1
  4. package/dist/index.js +340 -38
  5. package/ee/{markdown.ts → chunking/markdown.ts} +2 -2
  6. package/ee/python/README.md +295 -0
  7. package/ee/python/documents/processing/README.md +155 -0
  8. package/ee/{documents → python/documents}/processing/doc_processor.ts +25 -17
  9. package/ee/{documents/processing/pdf_to_markdown.py → python/documents/processing/document_to_markdown.py} +3 -10
  10. package/ee/python/setup.sh +180 -0
  11. package/package.json +14 -3
  12. package/scripts/postinstall.cjs +149 -0
  13. package/.agents/skills/mintlify/SKILL.md +0 -347
  14. package/.editorconfig +0 -15
  15. package/.eslintrc.json +0 -52
  16. package/.github/workflows/release-backend.yml +0 -38
  17. package/.husky/commit-msg +0 -1
  18. package/.jscpd.json +0 -18
  19. package/.mcp.json +0 -25
  20. package/.nvmrc +0 -1
  21. package/.prettierignore +0 -5
  22. package/.prettierrc.json +0 -12
  23. package/CHANGELOG.md +0 -8
  24. package/SECURITY.md +0 -5
  25. package/commitlint.config.js +0 -4
  26. package/devops/documentation/patch-older-releases.md +0 -42
  27. package/ee/documents/processing/build_pdf_processor.sh +0 -35
  28. package/ee/documents/processing/chunk_markdown.py +0 -263
  29. package/ee/documents/processing/pdf_processor.spec +0 -115
  30. package/eslint.config.js +0 -88
  31. package/jest.config.ts +0 -25
  32. package/mintlify-docs/.mintignore +0 -7
  33. package/mintlify-docs/AGENTS.md +0 -33
  34. package/mintlify-docs/CLAUDE.MD +0 -50
  35. package/mintlify-docs/CONTRIBUTING.md +0 -32
  36. package/mintlify-docs/LICENSE +0 -21
  37. package/mintlify-docs/README.md +0 -55
  38. package/mintlify-docs/ai-tools/claude-code.mdx +0 -43
  39. package/mintlify-docs/ai-tools/cursor.mdx +0 -39
  40. package/mintlify-docs/ai-tools/windsurf.mdx +0 -39
  41. package/mintlify-docs/api-reference/core-types/agent-types.mdx +0 -110
  42. package/mintlify-docs/api-reference/core-types/analytics-types.mdx +0 -95
  43. package/mintlify-docs/api-reference/core-types/configuration-types.mdx +0 -83
  44. package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +0 -106
  45. package/mintlify-docs/api-reference/core-types/job-types.mdx +0 -135
  46. package/mintlify-docs/api-reference/core-types/overview.mdx +0 -73
  47. package/mintlify-docs/api-reference/core-types/prompt-types.mdx +0 -102
  48. package/mintlify-docs/api-reference/core-types/rbac-types.mdx +0 -163
  49. package/mintlify-docs/api-reference/core-types/session-types.mdx +0 -77
  50. package/mintlify-docs/api-reference/core-types/user-management.mdx +0 -112
  51. package/mintlify-docs/api-reference/core-types/workflow-types.mdx +0 -88
  52. package/mintlify-docs/api-reference/core-types.mdx +0 -585
  53. package/mintlify-docs/api-reference/dynamic-types.mdx +0 -851
  54. package/mintlify-docs/api-reference/endpoint/create.mdx +0 -4
  55. package/mintlify-docs/api-reference/endpoint/delete.mdx +0 -4
  56. package/mintlify-docs/api-reference/endpoint/get.mdx +0 -4
  57. package/mintlify-docs/api-reference/endpoint/webhook.mdx +0 -4
  58. package/mintlify-docs/api-reference/introduction.mdx +0 -661
  59. package/mintlify-docs/api-reference/mutations.mdx +0 -1012
  60. package/mintlify-docs/api-reference/openapi.json +0 -217
  61. package/mintlify-docs/api-reference/queries.mdx +0 -1154
  62. package/mintlify-docs/backend/introduction.mdx +0 -218
  63. package/mintlify-docs/changelog.mdx +0 -387
  64. package/mintlify-docs/community-edition.mdx +0 -304
  65. package/mintlify-docs/core/exulu-agent/api-reference.mdx +0 -894
  66. package/mintlify-docs/core/exulu-agent/configuration.mdx +0 -690
  67. package/mintlify-docs/core/exulu-agent/introduction.mdx +0 -552
  68. package/mintlify-docs/core/exulu-app/api-reference.mdx +0 -481
  69. package/mintlify-docs/core/exulu-app/configuration.mdx +0 -319
  70. package/mintlify-docs/core/exulu-app/introduction.mdx +0 -117
  71. package/mintlify-docs/core/exulu-authentication.mdx +0 -810
  72. package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +0 -1011
  73. package/mintlify-docs/core/exulu-chunkers/configuration.mdx +0 -596
  74. package/mintlify-docs/core/exulu-chunkers/introduction.mdx +0 -403
  75. package/mintlify-docs/core/exulu-context/api-reference.mdx +0 -911
  76. package/mintlify-docs/core/exulu-context/configuration.mdx +0 -648
  77. package/mintlify-docs/core/exulu-context/introduction.mdx +0 -394
  78. package/mintlify-docs/core/exulu-database.mdx +0 -811
  79. package/mintlify-docs/core/exulu-default-agents.mdx +0 -545
  80. package/mintlify-docs/core/exulu-eval/api-reference.mdx +0 -772
  81. package/mintlify-docs/core/exulu-eval/configuration.mdx +0 -680
  82. package/mintlify-docs/core/exulu-eval/introduction.mdx +0 -459
  83. package/mintlify-docs/core/exulu-logging.mdx +0 -464
  84. package/mintlify-docs/core/exulu-otel.mdx +0 -670
  85. package/mintlify-docs/core/exulu-queues/api-reference.mdx +0 -648
  86. package/mintlify-docs/core/exulu-queues/configuration.mdx +0 -650
  87. package/mintlify-docs/core/exulu-queues/introduction.mdx +0 -474
  88. package/mintlify-docs/core/exulu-reranker/api-reference.mdx +0 -630
  89. package/mintlify-docs/core/exulu-reranker/configuration.mdx +0 -663
  90. package/mintlify-docs/core/exulu-reranker/introduction.mdx +0 -516
  91. package/mintlify-docs/core/exulu-tool/api-reference.mdx +0 -723
  92. package/mintlify-docs/core/exulu-tool/configuration.mdx +0 -805
  93. package/mintlify-docs/core/exulu-tool/introduction.mdx +0 -539
  94. package/mintlify-docs/core/exulu-variables/api-reference.mdx +0 -699
  95. package/mintlify-docs/core/exulu-variables/configuration.mdx +0 -736
  96. package/mintlify-docs/core/exulu-variables/introduction.mdx +0 -511
  97. package/mintlify-docs/development.mdx +0 -94
  98. package/mintlify-docs/docs.json +0 -248
  99. package/mintlify-docs/enterprise-edition.mdx +0 -538
  100. package/mintlify-docs/essentials/code.mdx +0 -35
  101. package/mintlify-docs/essentials/images.mdx +0 -59
  102. package/mintlify-docs/essentials/markdown.mdx +0 -88
  103. package/mintlify-docs/essentials/navigation.mdx +0 -87
  104. package/mintlify-docs/essentials/reusable-snippets.mdx +0 -110
  105. package/mintlify-docs/essentials/settings.mdx +0 -318
  106. package/mintlify-docs/favicon.svg +0 -3
  107. package/mintlify-docs/frontend/introduction.mdx +0 -39
  108. package/mintlify-docs/getting-started.mdx +0 -267
  109. package/mintlify-docs/guides/custom-agent.mdx +0 -608
  110. package/mintlify-docs/guides/first-agent.mdx +0 -315
  111. package/mintlify-docs/images/admin_ui.png +0 -0
  112. package/mintlify-docs/images/contexts.png +0 -0
  113. package/mintlify-docs/images/create_agents.png +0 -0
  114. package/mintlify-docs/images/evals.png +0 -0
  115. package/mintlify-docs/images/graphql.png +0 -0
  116. package/mintlify-docs/images/graphql_api.png +0 -0
  117. package/mintlify-docs/images/hero-dark.png +0 -0
  118. package/mintlify-docs/images/hero-light.png +0 -0
  119. package/mintlify-docs/images/hero.png +0 -0
  120. package/mintlify-docs/images/knowledge_sources.png +0 -0
  121. package/mintlify-docs/images/mcp.png +0 -0
  122. package/mintlify-docs/images/scaling.png +0 -0
  123. package/mintlify-docs/index.mdx +0 -411
  124. package/mintlify-docs/logo/dark.svg +0 -9
  125. package/mintlify-docs/logo/light.svg +0 -9
  126. package/mintlify-docs/partners.mdx +0 -558
  127. package/mintlify-docs/products.mdx +0 -77
  128. package/mintlify-docs/snippets/snippet-intro.mdx +0 -4
  129. package/mintlify-docs/styles.css +0 -207
  130. package/ngrok.bash +0 -1
  131. package/ngrok.md +0 -6
  132. package/ngrok.yml +0 -10
  133. package/release.config.cjs +0 -15
  134. package/skills-lock.json +0 -10
  135. package/types/context-processor.ts +0 -45
  136. package/types/enums/eval-types.ts +0 -5
  137. package/types/enums/field-types.ts +0 -1
  138. package/types/enums/jobs.ts +0 -11
  139. package/types/enums/statistics.ts +0 -13
  140. package/types/exulu-table-definition.ts +0 -79
  141. package/types/file-types.ts +0 -18
  142. package/types/models/agent-session.ts +0 -27
  143. package/types/models/agent.ts +0 -68
  144. package/types/models/context.ts +0 -53
  145. package/types/models/embedding.ts +0 -17
  146. package/types/models/eval-run.ts +0 -40
  147. package/types/models/exulu-agent-tool-config.ts +0 -11
  148. package/types/models/item.ts +0 -21
  149. package/types/models/job.ts +0 -8
  150. package/types/models/project.ts +0 -16
  151. package/types/models/rate-limiter-rules.ts +0 -7
  152. package/types/models/test-case.ts +0 -25
  153. package/types/models/tool.ts +0 -9
  154. package/types/models/user-role.ts +0 -12
  155. package/types/models/user.ts +0 -20
  156. package/types/models/variable.ts +0 -8
  157. package/types/models/vector-methods.ts +0 -7
  158. package/types/provider-config.ts +0 -21
  159. package/types/queue-config.ts +0 -16
  160. package/types/rbac-rights-modes.ts +0 -1
  161. package/types/statistics.ts +0 -20
  162. package/types/workflow.ts +0 -31
  163. /package/ee/{documents → python/documents}/THIRD_PARTY_LICENSES/docling.txt +0 -0
  164. /package/ee/{documents/processing → python}/requirements.txt +0 -0
@@ -0,0 +1,295 @@
1
+ # Exulu Python Integration
2
+
3
+ This directory contains Python scripts and utilities used by the Exulu backend. The integration is designed to be seamless for TypeScript developers, requiring minimal Python knowledge.
4
+
5
+ ## Quick Start
6
+
7
+ ### First-Time Setup
8
+
9
+ Run the setup command to configure your Python environment:
10
+
11
+ ```bash
12
+ npm run python:setup
13
+ ```
14
+
15
+ This will:
16
+ - ✅ Validate Python 3.10+ is installed
17
+ - ✅ Create a virtual environment at `ee/python/.venv`
18
+ - ✅ Install all required dependencies
19
+ - ✅ Verify the installation
20
+
21
+ **That's it!** You're ready to use Python scripts from TypeScript.
22
+
23
+ ## Available npm Scripts
24
+
25
+ | Command | Description |
26
+ |---------|-------------|
27
+ | `npm run python:setup` | Initial setup - creates venv and installs dependencies |
28
+ | `npm run python:install` | Install/update Python dependencies |
29
+ | `npm run python:validate` | Verify Python environment is working |
30
+ | `npm run python:clean` | Clean Python cache and virtual environment |
31
+ | `npm run python:rebuild` | Clean and rebuild Python environment from scratch |
32
+
33
+ ## Using Python Scripts from TypeScript
34
+
35
+ The `python-executor` utility provides a type-safe interface for calling Python scripts:
36
+
37
+ ### Basic Example
38
+
39
+ ```typescript
40
+ import { executePythonScript } from '../utils/python-executor';
41
+
42
+ // Execute a Python script
43
+ const result = await executePythonScript({
44
+ scriptPath: 'ee/python/documents/processing/document_to_markdown.py',
45
+ args: [
46
+ '/path/to/document.pdf',
47
+ '-o', '/output/processed.json',
48
+ '--images-dir', '/output/images'
49
+ ]
50
+ });
51
+
52
+ if (result.success) {
53
+ console.log('Script executed successfully!');
54
+ console.log('Output:', result.stdout);
55
+ } else {
56
+ console.error('Script failed:', result.stderr);
57
+ }
58
+ ```
59
+
60
+ ### Simple Usage (throws on error)
61
+
62
+ ```typescript
63
+ import { executePythonScriptSimple } from '../utils/python-executor';
64
+
65
+ // Get stdout directly, throws on error
66
+ const output = await executePythonScriptSimple({
67
+ scriptPath: 'ee/python/my_script.py',
68
+ args: ['arg1', 'arg2']
69
+ });
70
+
71
+ console.log('Output:', output);
72
+ ```
73
+
74
+ ### Advanced Configuration
75
+
76
+ ```typescript
77
+ import { executePythonScript } from '../utils/python-executor';
78
+
79
+ const result = await executePythonScript({
80
+ scriptPath: 'ee/python/my_script.py',
81
+ args: ['--verbose'],
82
+ cwd: process.cwd(),
83
+ timeout: 600000, // 10 minutes
84
+ env: {
85
+ CUSTOM_VAR: 'value'
86
+ },
87
+ validateEnvironment: true // default
88
+ });
89
+ ```
90
+
91
+ ### Error Handling
92
+
93
+ ```typescript
94
+ import {
95
+ executePythonScript,
96
+ PythonEnvironmentError,
97
+ PythonExecutionError
98
+ } from '../utils/python-executor';
99
+
100
+ try {
101
+ const result = await executePythonScript({
102
+ scriptPath: 'ee/python/my_script.py'
103
+ });
104
+
105
+ // Handle success
106
+ console.log(result.stdout);
107
+
108
+ } catch (error) {
109
+ if (error instanceof PythonEnvironmentError) {
110
+ // Python environment not set up
111
+ console.error('Please run: npm run python:setup');
112
+ } else if (error instanceof PythonExecutionError) {
113
+ // Script execution failed
114
+ console.error('Script error:', error.stderr);
115
+ console.error('Exit code:', error.exitCode);
116
+ }
117
+ }
118
+ ```
119
+
120
+ ### Check Environment Status
121
+
122
+ ```typescript
123
+ import { isPythonEnvironmentReady } from '../utils/python-executor';
124
+
125
+ if (await isPythonEnvironmentReady()) {
126
+ console.log('Python environment is ready!');
127
+ } else {
128
+ console.log('Please run: npm run python:setup');
129
+ }
130
+ ```
131
+
132
+ ## Directory Structure
133
+
134
+ ```
135
+ ee/python/
136
+ ├── .venv/ # Virtual environment (gitignored)
137
+ ├── requirements.txt # Python dependencies
138
+ ├── setup.sh # Setup script
139
+ ├── README.md # This file
140
+ └── documents/
141
+ └── processing/
142
+ ├── document_to_markdown.py
143
+ └── ...
144
+ ```
145
+
146
+ ## Adding New Python Scripts
147
+
148
+ 1. **Create your script** in an appropriate subdirectory under `ee/python/`
149
+
150
+ 2. **Add dependencies** to `requirements.txt`:
151
+ ```
152
+ your-package==1.2.3
153
+ ```
154
+
155
+ 3. **Update dependencies**:
156
+ ```bash
157
+ npm run python:install
158
+ ```
159
+
160
+ 4. **Use from TypeScript**:
161
+ ```typescript
162
+ import { executePythonScript } from '../utils/python-executor';
163
+
164
+ const result = await executePythonScript({
165
+ scriptPath: 'ee/python/your-module/your-script.py',
166
+ args: ['arg1', 'arg2']
167
+ });
168
+ ```
169
+
170
+ ## Troubleshooting
171
+
172
+ ### Python environment not found
173
+
174
+ ```bash
175
+ npm run python:setup
176
+ ```
177
+
178
+ ### Dependencies not installing
179
+
180
+ ```bash
181
+ npm run python:rebuild
182
+ ```
183
+
184
+ ### Script execution fails
185
+
186
+ 1. **Validate environment**:
187
+ ```bash
188
+ npm run python:validate
189
+ ```
190
+
191
+ 2. **Check Python version**:
192
+ ```bash
193
+ source ee/python/.venv/bin/activate
194
+ python --version # Should be 3.10+
195
+ ```
196
+
197
+ 3. **Test manually**:
198
+ ```bash
199
+ source ee/python/.venv/bin/activate
200
+ python ee/python/your-script.py --help
201
+ ```
202
+
203
+ ### Import errors in Python scripts
204
+
205
+ Make sure all required packages are in `requirements.txt` and run:
206
+ ```bash
207
+ npm run python:install
208
+ ```
209
+
210
+ ## Requirements
211
+
212
+ - **Python**: 3.10 or higher
213
+ - **pip**: Latest version (auto-upgraded during setup)
214
+ - **Operating System**: macOS, Linux, or Windows with WSL
215
+
216
+ ### Installing Python
217
+
218
+ **macOS:**
219
+ ```bash
220
+ brew install python@3.12
221
+ ```
222
+
223
+ **Ubuntu/Debian:**
224
+ ```bash
225
+ sudo apt-get update
226
+ sudo apt-get install python3.12 python3.12-venv python3-pip
227
+ ```
228
+
229
+ **Windows:**
230
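+ Only Windows with WSL is listed as supported (see Requirements above), so install Python inside your WSL distribution, e.g. with the Ubuntu/Debian commands above. Standalone installers are also available from [python.org](https://www.python.org/downloads/).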
231
+
232
+ ## CI/CD Integration
233
+
234
+ ### GitHub Actions Example
235
+
236
+ ```yaml
237
+ - name: Setup Python Environment
238
+ run: npm run python:setup
239
+
240
+ - name: Validate Python Environment
241
+ run: npm run python:validate
242
+
243
+ - name: Run Tests
244
+ run: npm test
245
+ ```
246
+
247
+ ### Caching Virtual Environment
248
+
249
+ ```yaml
250
+ - uses: actions/cache@v3
251
+ with:
252
+ path: ee/python/.venv
253
+ key: ${{ runner.os }}-python-${{ hashFiles('ee/python/requirements.txt') }}
254
+ ```
255
+
256
+ ## Best Practices
257
+
258
+ 1. **Always use the TypeScript wrapper** - Don't call Python directly with `exec()`
259
+ 2. **Pin dependency versions** in `requirements.txt` for reproducibility
260
+ 3. **Handle errors gracefully** - Use try/catch with specific error types
261
+ 4. **Set appropriate timeouts** - Long-running scripts should have higher timeouts
262
+ 5. **Log errors properly** - Both stdout and stderr should be logged
263
+ 6. **Test environment setup** - Run `npm run python:validate` in CI
264
+
265
+ ## Available Python Modules
266
+
267
+ ### Document Processing
268
+
269
+ **Script:** `documents/processing/document_to_markdown.py`
270
+
271
+ Converts documents (PDF, DOCX, etc.) to structured JSON with markdown content.
272
+
273
+ **Usage:**
274
+ ```typescript
275
+ import { executePythonScript } from '../utils/python-executor';
276
+
277
+ const result = await executePythonScript({
278
+ scriptPath: 'ee/python/documents/processing/document_to_markdown.py',
279
+ args: [
280
+ '/path/to/document.pdf',
281
+ '-o', '/output/processed.json',
282
+ '--images-dir', '/output/images'
283
+ ]
284
+ });
285
+
286
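+ const pages = JSON.parse(result.stdout); // NOTE(review): assumes the script also prints the JSON to stdout - otherwise read the file passed via `-o` instead; confirm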
287
+ ```
288
+
289
+ ## Support
290
+
291
+ For issues or questions:
292
+ 1. Check this README
293
+ 2. Run `npm run python:validate`
294
+ 3. Check the [troubleshooting](#troubleshooting) section
295
+ 4. Open an issue with logs from `npm run python:validate`
@@ -0,0 +1,155 @@
1
+ # Document Processing Module
2
+
3
+ Python scripts for processing documents (PDF, DOCX, etc.) into structured formats.
4
+
5
+ ## document_to_markdown.py
6
+
7
+ Converts documents to structured JSON with page-separated markdown content and extracted images.
8
+
9
+ ### Features
10
+
11
+ - ✅ PDF, DOCX, PPTX, and other document formats
12
+ - ✅ Page-by-page content extraction
13
+ - ✅ Hierarchical heading structure
14
+ - ✅ Image extraction with high resolution
15
+ - ✅ Table preservation in markdown format
16
+ - ✅ Normalized whitespace handling
17
+
18
+ ### Usage from TypeScript
19
+
20
+ ```typescript
21
+ import { executePythonScript } from '../../../../src/utils/python-executor';
22
+ import { readFile } from 'fs/promises';
23
+ import { join } from 'path';
24
+
25
+ async function processDocument(documentPath: string, outputDir: string) {
26
+ try {
27
+ // Execute the document processor
28
+ const result = await executePythonScript({
29
+ scriptPath: 'ee/python/documents/processing/document_to_markdown.py',
30
+ args: [
31
+ documentPath,
32
+ '-o', join(outputDir, 'processed.json'),
33
+ '--images-dir', join(outputDir, 'images')
34
+ ],
35
+ timeout: 600000 // 10 minutes for large documents
36
+ });
37
+
38
+ if (!result.success) {
39
+ throw new Error(`Processing failed: ${result.stderr}`);
40
+ }
41
+
42
+ // Read the processed JSON
43
+ const processedData = JSON.parse(
44
+ await readFile(join(outputDir, 'processed.json'), 'utf-8')
45
+ );
46
+
47
+ return processedData;
48
+ } catch (error) {
49
+ console.error('Document processing error:', error);
50
+ throw error;
51
+ }
52
+ }
53
+
54
+ // Example usage
55
+ const pages = await processDocument(
56
+ '/path/to/document.pdf',
57
+ '/output/directory'
58
+ );
59
+
60
+ // Access page content
61
+ pages.forEach((page, index) => {
62
+ console.log(`Page ${page.page}:`);
63
+ console.log(`Content: ${page.content.substring(0, 100)}...`);
64
+ console.log(`Image: ${page.image || 'None'}`);
65
+ console.log(`Headings:`, page.headings);
66
+ });
67
+ ```
68
+
69
+ ### Command-Line Usage
70
+
71
+ ```bash
72
+ # Activate virtual environment
73
+ source ee/python/.venv/bin/activate
74
+
75
+ # Process a document
76
+ python ee/python/documents/processing/document_to_markdown.py \
77
+ /path/to/document.pdf \
78
+ -o /output/processed.json \
79
+ --images-dir /output/images
80
+ ```
81
+
82
+ ### Output Format
83
+
84
+ The script outputs a JSON array with page objects:
85
+
86
+ ```json
87
+ [
88
+ {
89
+ "page": 1,
90
+ "content": "# Document Title\n\nFirst paragraph...",
91
+ "image": "/output/images/page_1.png",
92
+ "headings": {
93
+ "Document Title": null
94
+ }
95
+ },
96
+ {
97
+ "page": 2,
98
+ "content": "## Section 1\n\nMore content...",
99
+ "image": "/output/images/page_2.png",
100
+ "headings": {
101
+ "Document Title": {
102
+ "Section 1": null
103
+ }
104
+ }
105
+ }
106
+ ]
107
+ ```
108
+
109
+ ### Arguments
110
+
111
+ | Argument | Description | Required |
112
+ |----------|-------------|----------|
113
+ | `pdf_path` | Path to the document file | Yes |
114
+ | `-o, --output` | Output path for JSON file | No (default: `<document_name>/processed.json`) |
115
+ | `--images-dir` | Directory to save page images | No (default: `<output_dir>/images`) |
116
+
117
+ ### Configuration
118
+
119
+ You can modify these constants in the script:
120
+
121
+ ```python
122
+ IMAGE_RESOLUTION_SCALE = 2.0 # Image resolution multiplier
123
+ ```
124
+
125
+ ### Dependencies
126
+
127
+ This script requires the following Python packages (installed via `npm run python:setup`):
128
+
129
+ - `docling` - Document conversion
130
+ - `docling-hierarchical-pdf` - Hierarchical heading processing
131
+ - `transformers` - ML-based text processing
132
+ - `Pillow` (imported as `PIL`) - Image handling
133
+
134
+ ### Troubleshooting
135
+
136
+ **Issue: ImportError for docling**
137
+ ```bash
138
+ npm run python:install
139
+ ```
140
+
141
+ **Issue: Script timeout for large documents**
142
+ ```typescript
143
+ // Increase timeout
144
+ const result = await executePythonScript({
145
+ scriptPath: '...',
146
+ timeout: 1200000 // 20 minutes
147
+ });
148
+ ```
149
+
150
+ **Issue: Low-quality images**
151
+
152
+ Increase `IMAGE_RESOLUTION_SCALE` in the script:
153
+ ```python
154
+ IMAGE_RESOLUTION_SCALE = 3.0 # Higher quality
155
+ ```
@@ -13,8 +13,7 @@ import TurndownService from 'turndown';
13
13
  import WordExtractor from 'word-extractor';
14
14
  import { parseOfficeAsync } from "officeparser";
15
15
  import { checkLicense } from '@EE/entitlements';
16
-
17
- const execAsync = promisify(exec);
16
+ import { executePythonScript } from '@SRC/utils/python-executor';
18
17
 
19
18
  type DocumentProcessorConfig = {
20
19
  vlm?: {
@@ -430,19 +429,28 @@ async function processPdf(
430
429
  ): Promise<ProcessorOutput> {
431
430
  try {
432
431
  let json: ProcessedDocument;
433
- // Call the PDF processor executable
432
+ // Call the PDF processor script
434
433
  if (config?.docling) {
435
- /* `python3 pdf_to_markdown.py "${paths.source}" --output "${paths.json}" --images-dir "${paths.images}"` */
436
- const script = `modal run modal_script.py --pdf-path "${paths.source}" --output "${paths.json}" --images-dir "${paths.images}"`
437
- console.log(`[EXULU] Running python script: ${script}`);
438
- const { stderr } = await execAsync(
439
- // todo replace python3 with the compiled executable
440
- script,
441
- { maxBuffer: 2000 * 1024 * 1024 } // 2000 MB buffer for large outputs
442
- );
443
- // Log stderr (processing info, not errors)
444
- if (stderr) {
445
- console.log('Processing info:', stderr.trim());
434
+
435
+ console.log(`[EXULU] Processing document with document_to_markdown.py`);
436
+
437
+ const result = await executePythonScript({
438
+ scriptPath: 'ee/python/documents/processing/document_to_markdown.py',
439
+ args: [
440
+ paths.source,
441
+ '-o', paths.json,
442
+ '--images-dir', paths.images
443
+ ],
444
+ timeout: 30 * 60 * 1000, // 30 minutes for large documents
445
+ });
446
+
447
+ // Log processing info from stderr
448
+ if (result.stderr) {
449
+ console.log('Processing info:', result.stderr.trim());
450
+ }
451
+
452
+ if (!result.success) {
453
+ throw new Error(`Document processing failed: ${result.stderr}`);
446
454
  }
447
455
 
448
456
  // Read the generated JSON file
@@ -460,9 +468,9 @@ async function processPdf(
460
468
  }];
461
469
  }
462
470
 
463
- console.log(`\n✓ Document processing completed successfully`);
464
- console.log(` Total pages: ${json.length}`);
465
- console.log(` Output file: ${paths.json}`);
471
+ console.log(`[EXULU] \n✓ Document processing completed successfully`);
472
+ console.log(`[EXULU] Total pages: ${json.length}`);
473
+ console.log(`[EXULU] Output file: ${paths.json}`);
466
474
 
467
475
  if (!config?.docling && config?.vlm?.model) {
468
476
  console.error('[EXULU] VLM validation is only supported when docling is enabled, skipping validation.');
@@ -1,10 +1,10 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- PDF to Markdown Converter using Docling
4
- Converts a PDF to JSON with page-separated markdown and images.
3
+ Document to Markdown Converter using Docling
4
+ Converts a document to JSON with page-separated markdown and images.
5
5
 
6
6
  Usage:
7
- pdf_to_markdown.py <pdf_file_path> [-o OUTPUT_PATH] [--max-tokens MAX_TOKENS]
7
+ document_to_markdown.py <document_file_path> [-o OUTPUT_PATH] [--images-dir IMAGES_DIR]
8
8
  """
9
9
 
10
10
  import sys
@@ -346,13 +346,6 @@ def main():
346
346
  help='Directory to save page images (default: <pdf_name>_images/)'
347
347
  )
348
348
 
349
- parser.add_argument(
350
- '--max-tokens',
351
- type=int,
352
- dest='max_tokens',
353
- help='Maximum number of tokens (currently not used, reserved for future use)'
354
- )
355
-
356
349
  # Parse arguments
357
350
  args = parser.parse_args()
358
351