@exulu/backend 1.48.2 → 1.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +351 -42
- package/dist/index.d.cts +96 -1
- package/dist/index.d.ts +96 -1
- package/dist/index.js +340 -38
- package/ee/{markdown.ts → chunking/markdown.ts} +2 -2
- package/ee/python/README.md +295 -0
- package/ee/python/documents/processing/README.md +155 -0
- package/ee/{documents → python/documents}/processing/doc_processor.ts +25 -17
- package/ee/{documents/processing/pdf_to_markdown.py → python/documents/processing/document_to_markdown.py} +3 -10
- package/ee/python/setup.sh +180 -0
- package/package.json +14 -3
- package/scripts/postinstall.cjs +149 -0
- package/.agents/skills/mintlify/SKILL.md +0 -347
- package/.editorconfig +0 -15
- package/.eslintrc.json +0 -52
- package/.github/workflows/release-backend.yml +0 -38
- package/.husky/commit-msg +0 -1
- package/.jscpd.json +0 -18
- package/.mcp.json +0 -25
- package/.nvmrc +0 -1
- package/.prettierignore +0 -5
- package/.prettierrc.json +0 -12
- package/CHANGELOG.md +0 -8
- package/SECURITY.md +0 -5
- package/commitlint.config.js +0 -4
- package/devops/documentation/patch-older-releases.md +0 -42
- package/ee/documents/processing/build_pdf_processor.sh +0 -35
- package/ee/documents/processing/chunk_markdown.py +0 -263
- package/ee/documents/processing/pdf_processor.spec +0 -115
- package/eslint.config.js +0 -88
- package/jest.config.ts +0 -25
- package/mintlify-docs/.mintignore +0 -7
- package/mintlify-docs/AGENTS.md +0 -33
- package/mintlify-docs/CLAUDE.MD +0 -50
- package/mintlify-docs/CONTRIBUTING.md +0 -32
- package/mintlify-docs/LICENSE +0 -21
- package/mintlify-docs/README.md +0 -55
- package/mintlify-docs/ai-tools/claude-code.mdx +0 -43
- package/mintlify-docs/ai-tools/cursor.mdx +0 -39
- package/mintlify-docs/ai-tools/windsurf.mdx +0 -39
- package/mintlify-docs/api-reference/core-types/agent-types.mdx +0 -110
- package/mintlify-docs/api-reference/core-types/analytics-types.mdx +0 -95
- package/mintlify-docs/api-reference/core-types/configuration-types.mdx +0 -83
- package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +0 -106
- package/mintlify-docs/api-reference/core-types/job-types.mdx +0 -135
- package/mintlify-docs/api-reference/core-types/overview.mdx +0 -73
- package/mintlify-docs/api-reference/core-types/prompt-types.mdx +0 -102
- package/mintlify-docs/api-reference/core-types/rbac-types.mdx +0 -163
- package/mintlify-docs/api-reference/core-types/session-types.mdx +0 -77
- package/mintlify-docs/api-reference/core-types/user-management.mdx +0 -112
- package/mintlify-docs/api-reference/core-types/workflow-types.mdx +0 -88
- package/mintlify-docs/api-reference/core-types.mdx +0 -585
- package/mintlify-docs/api-reference/dynamic-types.mdx +0 -851
- package/mintlify-docs/api-reference/endpoint/create.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/delete.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/get.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/webhook.mdx +0 -4
- package/mintlify-docs/api-reference/introduction.mdx +0 -661
- package/mintlify-docs/api-reference/mutations.mdx +0 -1012
- package/mintlify-docs/api-reference/openapi.json +0 -217
- package/mintlify-docs/api-reference/queries.mdx +0 -1154
- package/mintlify-docs/backend/introduction.mdx +0 -218
- package/mintlify-docs/changelog.mdx +0 -387
- package/mintlify-docs/community-edition.mdx +0 -304
- package/mintlify-docs/core/exulu-agent/api-reference.mdx +0 -894
- package/mintlify-docs/core/exulu-agent/configuration.mdx +0 -690
- package/mintlify-docs/core/exulu-agent/introduction.mdx +0 -552
- package/mintlify-docs/core/exulu-app/api-reference.mdx +0 -481
- package/mintlify-docs/core/exulu-app/configuration.mdx +0 -319
- package/mintlify-docs/core/exulu-app/introduction.mdx +0 -117
- package/mintlify-docs/core/exulu-authentication.mdx +0 -810
- package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +0 -1011
- package/mintlify-docs/core/exulu-chunkers/configuration.mdx +0 -596
- package/mintlify-docs/core/exulu-chunkers/introduction.mdx +0 -403
- package/mintlify-docs/core/exulu-context/api-reference.mdx +0 -911
- package/mintlify-docs/core/exulu-context/configuration.mdx +0 -648
- package/mintlify-docs/core/exulu-context/introduction.mdx +0 -394
- package/mintlify-docs/core/exulu-database.mdx +0 -811
- package/mintlify-docs/core/exulu-default-agents.mdx +0 -545
- package/mintlify-docs/core/exulu-eval/api-reference.mdx +0 -772
- package/mintlify-docs/core/exulu-eval/configuration.mdx +0 -680
- package/mintlify-docs/core/exulu-eval/introduction.mdx +0 -459
- package/mintlify-docs/core/exulu-logging.mdx +0 -464
- package/mintlify-docs/core/exulu-otel.mdx +0 -670
- package/mintlify-docs/core/exulu-queues/api-reference.mdx +0 -648
- package/mintlify-docs/core/exulu-queues/configuration.mdx +0 -650
- package/mintlify-docs/core/exulu-queues/introduction.mdx +0 -474
- package/mintlify-docs/core/exulu-reranker/api-reference.mdx +0 -630
- package/mintlify-docs/core/exulu-reranker/configuration.mdx +0 -663
- package/mintlify-docs/core/exulu-reranker/introduction.mdx +0 -516
- package/mintlify-docs/core/exulu-tool/api-reference.mdx +0 -723
- package/mintlify-docs/core/exulu-tool/configuration.mdx +0 -805
- package/mintlify-docs/core/exulu-tool/introduction.mdx +0 -539
- package/mintlify-docs/core/exulu-variables/api-reference.mdx +0 -699
- package/mintlify-docs/core/exulu-variables/configuration.mdx +0 -736
- package/mintlify-docs/core/exulu-variables/introduction.mdx +0 -511
- package/mintlify-docs/development.mdx +0 -94
- package/mintlify-docs/docs.json +0 -248
- package/mintlify-docs/enterprise-edition.mdx +0 -538
- package/mintlify-docs/essentials/code.mdx +0 -35
- package/mintlify-docs/essentials/images.mdx +0 -59
- package/mintlify-docs/essentials/markdown.mdx +0 -88
- package/mintlify-docs/essentials/navigation.mdx +0 -87
- package/mintlify-docs/essentials/reusable-snippets.mdx +0 -110
- package/mintlify-docs/essentials/settings.mdx +0 -318
- package/mintlify-docs/favicon.svg +0 -3
- package/mintlify-docs/frontend/introduction.mdx +0 -39
- package/mintlify-docs/getting-started.mdx +0 -267
- package/mintlify-docs/guides/custom-agent.mdx +0 -608
- package/mintlify-docs/guides/first-agent.mdx +0 -315
- package/mintlify-docs/images/admin_ui.png +0 -0
- package/mintlify-docs/images/contexts.png +0 -0
- package/mintlify-docs/images/create_agents.png +0 -0
- package/mintlify-docs/images/evals.png +0 -0
- package/mintlify-docs/images/graphql.png +0 -0
- package/mintlify-docs/images/graphql_api.png +0 -0
- package/mintlify-docs/images/hero-dark.png +0 -0
- package/mintlify-docs/images/hero-light.png +0 -0
- package/mintlify-docs/images/hero.png +0 -0
- package/mintlify-docs/images/knowledge_sources.png +0 -0
- package/mintlify-docs/images/mcp.png +0 -0
- package/mintlify-docs/images/scaling.png +0 -0
- package/mintlify-docs/index.mdx +0 -411
- package/mintlify-docs/logo/dark.svg +0 -9
- package/mintlify-docs/logo/light.svg +0 -9
- package/mintlify-docs/partners.mdx +0 -558
- package/mintlify-docs/products.mdx +0 -77
- package/mintlify-docs/snippets/snippet-intro.mdx +0 -4
- package/mintlify-docs/styles.css +0 -207
- package/ngrok.bash +0 -1
- package/ngrok.md +0 -6
- package/ngrok.yml +0 -10
- package/release.config.cjs +0 -15
- package/skills-lock.json +0 -10
- package/types/context-processor.ts +0 -45
- package/types/enums/eval-types.ts +0 -5
- package/types/enums/field-types.ts +0 -1
- package/types/enums/jobs.ts +0 -11
- package/types/enums/statistics.ts +0 -13
- package/types/exulu-table-definition.ts +0 -79
- package/types/file-types.ts +0 -18
- package/types/models/agent-session.ts +0 -27
- package/types/models/agent.ts +0 -68
- package/types/models/context.ts +0 -53
- package/types/models/embedding.ts +0 -17
- package/types/models/eval-run.ts +0 -40
- package/types/models/exulu-agent-tool-config.ts +0 -11
- package/types/models/item.ts +0 -21
- package/types/models/job.ts +0 -8
- package/types/models/project.ts +0 -16
- package/types/models/rate-limiter-rules.ts +0 -7
- package/types/models/test-case.ts +0 -25
- package/types/models/tool.ts +0 -9
- package/types/models/user-role.ts +0 -12
- package/types/models/user.ts +0 -20
- package/types/models/variable.ts +0 -8
- package/types/models/vector-methods.ts +0 -7
- package/types/provider-config.ts +0 -21
- package/types/queue-config.ts +0 -16
- package/types/rbac-rights-modes.ts +0 -1
- package/types/statistics.ts +0 -20
- package/types/workflow.ts +0 -31
- /package/ee/{documents → python/documents}/THIRD_PARTY_LICENSES/docling.txt +0 -0
- /package/ee/{documents/processing → python}/requirements.txt +0 -0
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
# Exulu Python Integration
|
|
2
|
+
|
|
3
|
+
This directory contains Python scripts and utilities used by the Exulu backend. The integration is designed to be seamless for TypeScript developers, requiring minimal Python knowledge.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
### First-Time Setup
|
|
8
|
+
|
|
9
|
+
Run the setup command to configure your Python environment:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
npm run python:setup
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
This will:
|
|
16
|
+
- ✅ Validate Python 3.10+ is installed
|
|
17
|
+
- ✅ Create a virtual environment at `ee/python/.venv`
|
|
18
|
+
- ✅ Install all required dependencies
|
|
19
|
+
- ✅ Verify the installation
|
|
20
|
+
|
|
21
|
+
**That's it!** You're ready to use Python scripts from TypeScript.
|
|
22
|
+
|
|
23
|
+
## Available npm Scripts
|
|
24
|
+
|
|
25
|
+
| Command | Description |
|
|
26
|
+
|---------|-------------|
|
|
27
|
+
| `npm run python:setup` | Initial setup - creates venv and installs dependencies |
|
|
28
|
+
| `npm run python:install` | Install/update Python dependencies |
|
|
29
|
+
| `npm run python:validate` | Verify Python environment is working |
|
|
30
|
+
| `npm run python:clean` | Clean Python cache and virtual environment |
|
|
31
|
+
| `npm run python:rebuild` | Clean and rebuild Python environment from scratch |
|
|
32
|
+
|
|
33
|
+
## Using Python Scripts from TypeScript
|
|
34
|
+
|
|
35
|
+
The `python-executor` utility provides a type-safe interface for calling Python scripts:
|
|
36
|
+
|
|
37
|
+
### Basic Example
|
|
38
|
+
|
|
39
|
+
```typescript
|
|
40
|
+
import { executePythonScript } from '../utils/python-executor';
|
|
41
|
+
|
|
42
|
+
// Execute a Python script
|
|
43
|
+
const result = await executePythonScript({
|
|
44
|
+
scriptPath: 'ee/python/documents/processing/document_to_markdown.py',
|
|
45
|
+
args: [
|
|
46
|
+
'/path/to/document.pdf',
|
|
47
|
+
'-o', '/output/processed.json',
|
|
48
|
+
'--images-dir', '/output/images'
|
|
49
|
+
]
|
|
50
|
+
});
|
|
51
|
+
|
|
52
|
+
if (result.success) {
|
|
53
|
+
console.log('Script executed successfully!');
|
|
54
|
+
console.log('Output:', result.stdout);
|
|
55
|
+
} else {
|
|
56
|
+
console.error('Script failed:', result.stderr);
|
|
57
|
+
}
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Simple Usage (throws on error)
|
|
61
|
+
|
|
62
|
+
```typescript
|
|
63
|
+
import { executePythonScriptSimple } from '../utils/python-executor';
|
|
64
|
+
|
|
65
|
+
// Get stdout directly, throws on error
|
|
66
|
+
const output = await executePythonScriptSimple({
|
|
67
|
+
scriptPath: 'ee/python/my_script.py',
|
|
68
|
+
args: ['arg1', 'arg2']
|
|
69
|
+
});
|
|
70
|
+
|
|
71
|
+
console.log('Output:', output);
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Advanced Configuration
|
|
75
|
+
|
|
76
|
+
```typescript
|
|
77
|
+
import { executePythonScript } from '../utils/python-executor';
|
|
78
|
+
|
|
79
|
+
const result = await executePythonScript({
|
|
80
|
+
scriptPath: 'ee/python/my_script.py',
|
|
81
|
+
args: ['--verbose'],
|
|
82
|
+
cwd: process.cwd(),
|
|
83
|
+
timeout: 600000, // 10 minutes
|
|
84
|
+
env: {
|
|
85
|
+
CUSTOM_VAR: 'value'
|
|
86
|
+
},
|
|
87
|
+
validateEnvironment: true // default
|
|
88
|
+
});
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Error Handling
|
|
92
|
+
|
|
93
|
+
```typescript
|
|
94
|
+
import {
|
|
95
|
+
executePythonScript,
|
|
96
|
+
PythonEnvironmentError,
|
|
97
|
+
PythonExecutionError
|
|
98
|
+
} from '../utils/python-executor';
|
|
99
|
+
|
|
100
|
+
try {
|
|
101
|
+
const result = await executePythonScript({
|
|
102
|
+
scriptPath: 'ee/python/my_script.py'
|
|
103
|
+
});
|
|
104
|
+
|
|
105
|
+
// Handle success
|
|
106
|
+
console.log(result.stdout);
|
|
107
|
+
|
|
108
|
+
} catch (error) {
|
|
109
|
+
if (error instanceof PythonEnvironmentError) {
|
|
110
|
+
// Python environment not set up
|
|
111
|
+
console.error('Please run: npm run python:setup');
|
|
112
|
+
} else if (error instanceof PythonExecutionError) {
|
|
113
|
+
// Script execution failed
|
|
114
|
+
console.error('Script error:', error.stderr);
|
|
115
|
+
console.error('Exit code:', error.exitCode);
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Check Environment Status
|
|
121
|
+
|
|
122
|
+
```typescript
|
|
123
|
+
import { isPythonEnvironmentReady } from '../utils/python-executor';
|
|
124
|
+
|
|
125
|
+
if (await isPythonEnvironmentReady()) {
|
|
126
|
+
console.log('Python environment is ready!');
|
|
127
|
+
} else {
|
|
128
|
+
console.log('Please run: npm run python:setup');
|
|
129
|
+
}
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Directory Structure
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
ee/python/
|
|
136
|
+
├── .venv/ # Virtual environment (gitignored)
|
|
137
|
+
├── requirements.txt # Python dependencies
|
|
138
|
+
├── setup.sh # Setup script
|
|
139
|
+
├── README.md # This file
|
|
140
|
+
└── documents/
|
|
141
|
+
└── processing/
|
|
142
|
+
├── document_to_markdown.py
|
|
143
|
+
└── ...
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Adding New Python Scripts
|
|
147
|
+
|
|
148
|
+
1. **Create your script** in an appropriate subdirectory under `ee/python/`
|
|
149
|
+
|
|
150
|
+
2. **Add dependencies** to `requirements.txt`:
|
|
151
|
+
```
|
|
152
|
+
your-package==1.2.3
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
3. **Update dependencies**:
|
|
156
|
+
```bash
|
|
157
|
+
npm run python:install
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
4. **Use from TypeScript**:
|
|
161
|
+
```typescript
|
|
162
|
+
import { executePythonScript } from '../utils/python-executor';
|
|
163
|
+
|
|
164
|
+
const result = await executePythonScript({
|
|
165
|
+
scriptPath: 'ee/python/your-module/your-script.py',
|
|
166
|
+
args: ['arg1', 'arg2']
|
|
167
|
+
});
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Troubleshooting
|
|
171
|
+
|
|
172
|
+
### Python environment not found
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
npm run python:setup
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Dependencies not installing
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
npm run python:rebuild
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### Script execution fails
|
|
185
|
+
|
|
186
|
+
1. **Validate environment**:
|
|
187
|
+
```bash
|
|
188
|
+
npm run python:validate
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
2. **Check Python version**:
|
|
192
|
+
```bash
|
|
193
|
+
source ee/python/.venv/bin/activate
|
|
194
|
+
python --version # Should be 3.10+
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
3. **Test manually**:
|
|
198
|
+
```bash
|
|
199
|
+
source ee/python/.venv/bin/activate
|
|
200
|
+
python ee/python/your-script.py --help
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Import errors in Python scripts
|
|
204
|
+
|
|
205
|
+
Make sure all required packages are in `requirements.txt` and run:
|
|
206
|
+
```bash
|
|
207
|
+
npm run python:install
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Requirements
|
|
211
|
+
|
|
212
|
+
- **Python**: 3.10 or higher
|
|
213
|
+
- **pip**: Latest version (auto-upgraded during setup)
|
|
214
|
+
- **Operating System**: macOS, Linux, or Windows with WSL
|
|
215
|
+
|
|
216
|
+
### Installing Python
|
|
217
|
+
|
|
218
|
+
**macOS:**
|
|
219
|
+
```bash
|
|
220
|
+
brew install python@3.12
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
**Ubuntu/Debian:**
|
|
224
|
+
```bash
|
|
225
|
+
sudo apt-get update
|
|
226
|
+
sudo apt-get install python3.12 python3.12-venv python3-pip
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
**Windows:**
|
|
230
|
+
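Use WSL and install Python inside it (follow the **Ubuntu/Debian** steps above), since `setup.sh` and the `source ee/python/.venv/bin/activate` commands in this README require a bash shell. Native installers are also available from [python.org](https://www.python.org/downloads/).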
|
|
231
|
+
|
|
232
|
+
## CI/CD Integration
|
|
233
|
+
|
|
234
|
+
### GitHub Actions Example
|
|
235
|
+
|
|
236
|
+
```yaml
|
|
237
|
+
- name: Setup Python Environment
|
|
238
|
+
run: npm run python:setup
|
|
239
|
+
|
|
240
|
+
- name: Validate Python Environment
|
|
241
|
+
run: npm run python:validate
|
|
242
|
+
|
|
243
|
+
- name: Run Tests
|
|
244
|
+
run: npm test
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### Caching Virtual Environment
|
|
248
|
+
|
|
249
|
+
```yaml
|
|
250
|
+
- uses: actions/cache@v3
|
|
251
|
+
with:
|
|
252
|
+
path: ee/python/.venv
|
|
253
|
+
key: ${{ runner.os }}-python-${{ hashFiles('ee/python/requirements.txt') }}
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## Best Practices
|
|
257
|
+
|
|
258
|
+
1. **Always use the TypeScript wrapper** - Don't call Python directly with `exec()`
|
|
259
|
+
2. **Pin dependency versions** in `requirements.txt` for reproducibility
|
|
260
|
+
3. **Handle errors gracefully** - Use try/catch with specific error types
|
|
261
|
+
4. **Set appropriate timeouts** - Long-running scripts should have higher timeouts
|
|
262
|
+
5. **Log errors properly** - Both stdout and stderr should be logged
|
|
263
|
+
6. **Test environment setup** - Run `npm run python:validate` in CI
|
|
264
|
+
|
|
265
|
+
## Available Python Modules
|
|
266
|
+
|
|
267
|
+
### Document Processing
|
|
268
|
+
|
|
269
|
+
**Script:** `documents/processing/document_to_markdown.py`
|
|
270
|
+
|
|
271
|
+
Converts documents (PDF, DOCX, etc.) to structured JSON with markdown content.
|
|
272
|
+
|
|
273
|
+
**Usage:**
|
|
274
|
+
```typescript
|
|
275
|
+
import { executePythonScript } from '../utils/python-executor';
|
|
276
|
+
|
|
277
|
+
const result = await executePythonScript({
|
|
278
|
+
scriptPath: 'ee/python/documents/processing/document_to_markdown.py',
|
|
279
|
+
args: [
|
|
280
|
+
'/path/to/document.pdf',
|
|
281
|
+
'-o', '/output/processed.json',
|
|
282
|
+
'--images-dir', '/output/images'
|
|
283
|
+
]
|
|
284
|
+
});
|
|
285
|
+
|
|
286
|
+
const pages = JSON.parse(result.stdout);
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
## Support
|
|
290
|
+
|
|
291
|
+
For issues or questions:
|
|
292
|
+
1. Check this README
|
|
293
|
+
2. Run `npm run python:validate`
|
|
294
|
+
3. Check the [troubleshooting](#troubleshooting) section
|
|
295
|
+
4. Open an issue with logs from `npm run python:validate`
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# Document Processing Module
|
|
2
|
+
|
|
3
|
+
Python scripts for processing documents (PDF, DOCX, etc.) into structured formats.
|
|
4
|
+
|
|
5
|
+
## document_to_markdown.py
|
|
6
|
+
|
|
7
|
+
Converts documents to structured JSON with page-separated markdown content and extracted images.
|
|
8
|
+
|
|
9
|
+
### Features
|
|
10
|
+
|
|
11
|
+
- ✅ PDF, DOCX, PPTX, and other document formats
|
|
12
|
+
- ✅ Page-by-page content extraction
|
|
13
|
+
- ✅ Hierarchical heading structure
|
|
14
|
+
- ✅ Image extraction with high resolution
|
|
15
|
+
- ✅ Table preservation in markdown format
|
|
16
|
+
- ✅ Normalized whitespace handling
|
|
17
|
+
|
|
18
|
+
### Usage from TypeScript
|
|
19
|
+
|
|
20
|
+
```typescript
|
|
21
|
+
import { executePythonScript } from '../../../../src/utils/python-executor';
|
|
22
|
+
import { readFile } from 'fs/promises';
|
|
23
|
+
import { join } from 'path';
|
|
24
|
+
|
|
25
|
+
async function processDocument(documentPath: string, outputDir: string) {
|
|
26
|
+
try {
|
|
27
|
+
// Execute the document processor
|
|
28
|
+
const result = await executePythonScript({
|
|
29
|
+
scriptPath: 'ee/python/documents/processing/document_to_markdown.py',
|
|
30
|
+
args: [
|
|
31
|
+
documentPath,
|
|
32
|
+
'-o', join(outputDir, 'processed.json'),
|
|
33
|
+
'--images-dir', join(outputDir, 'images')
|
|
34
|
+
],
|
|
35
|
+
timeout: 600000 // 10 minutes for large documents
|
|
36
|
+
});
|
|
37
|
+
|
|
38
|
+
if (!result.success) {
|
|
39
|
+
throw new Error(`Processing failed: ${result.stderr}`);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
// Read the processed JSON
|
|
43
|
+
const processedData = JSON.parse(
|
|
44
|
+
await readFile(join(outputDir, 'processed.json'), 'utf-8')
|
|
45
|
+
);
|
|
46
|
+
|
|
47
|
+
return processedData;
|
|
48
|
+
} catch (error) {
|
|
49
|
+
console.error('Document processing error:', error);
|
|
50
|
+
throw error;
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
// Example usage
|
|
55
|
+
const pages = await processDocument(
|
|
56
|
+
'/path/to/document.pdf',
|
|
57
|
+
'/output/directory'
|
|
58
|
+
);
|
|
59
|
+
|
|
60
|
+
// Access page content
|
|
61
|
+
pages.forEach((page, index) => {
|
|
62
|
+
console.log(`Page ${page.page}:`);
|
|
63
|
+
console.log(`Content: ${page.content.substring(0, 100)}...`);
|
|
64
|
+
console.log(`Image: ${page.image || 'None'}`);
|
|
65
|
+
console.log(`Headings:`, page.headings);
|
|
66
|
+
});
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Command-Line Usage
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Activate virtual environment
|
|
73
|
+
source ee/python/.venv/bin/activate
|
|
74
|
+
|
|
75
|
+
# Process a document
|
|
76
|
+
python ee/python/documents/processing/document_to_markdown.py \
|
|
77
|
+
/path/to/document.pdf \
|
|
78
|
+
-o /output/processed.json \
|
|
79
|
+
--images-dir /output/images
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Output Format
|
|
83
|
+
|
|
84
|
+
The script writes a JSON array of page objects to the output path (`-o`):
|
|
85
|
+
|
|
86
|
+
```json
|
|
87
|
+
[
|
|
88
|
+
{
|
|
89
|
+
"page": 1,
|
|
90
|
+
"content": "# Document Title\n\nFirst paragraph...",
|
|
91
|
+
"image": "/output/images/page_1.png",
|
|
92
|
+
"headings": {
|
|
93
|
+
"Document Title": null
|
|
94
|
+
}
|
|
95
|
+
},
|
|
96
|
+
{
|
|
97
|
+
"page": 2,
|
|
98
|
+
"content": "## Section 1\n\nMore content...",
|
|
99
|
+
"image": "/output/images/page_2.png",
|
|
100
|
+
"headings": {
|
|
101
|
+
"Document Title": {
|
|
102
|
+
"Section 1": null
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
]
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Arguments
|
|
110
|
+
|
|
111
|
+
| Argument | Description | Required |
|
|
112
|
+
|----------|-------------|----------|
|
|
113
|
+
| `pdf_path` | Path to the document file | Yes |
|
|
114
|
+
| `-o, --output` | Output path for JSON file | No (default: `<document_name>/processed.json`) |
|
|
115
|
+
| `--images-dir` | Directory to save page images | No (default: `<output_dir>/images`) |
|
|
116
|
+
|
|
117
|
+
### Configuration
|
|
118
|
+
|
|
119
|
+
You can modify these constants in the script:
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
IMAGE_RESOLUTION_SCALE = 2.0 # Image resolution multiplier
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Dependencies
|
|
126
|
+
|
|
127
|
+
This script requires the following Python packages (installed via `npm run python:setup`):
|
|
128
|
+
|
|
129
|
+
- `docling` - Document conversion
|
|
130
|
+
- `docling-hierarchical-pdf` - Hierarchical heading processing
|
|
131
|
+
- `transformers` - ML-based text processing
|
|
132
|
+
- `Pillow` (imported as `PIL`) - Image handling
|
|
133
|
+
|
|
134
|
+
### Troubleshooting
|
|
135
|
+
|
|
136
|
+
**Issue: ImportError for docling**
|
|
137
|
+
```bash
|
|
138
|
+
npm run python:install
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
**Issue: Script timeout for large documents**
|
|
142
|
+
```typescript
|
|
143
|
+
// Increase timeout
|
|
144
|
+
const result = await executePythonScript({
|
|
145
|
+
scriptPath: '...',
|
|
146
|
+
timeout: 1200000 // 20 minutes
|
|
147
|
+
});
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**Issue: Low-quality images**
|
|
151
|
+
|
|
152
|
+
Increase `IMAGE_RESOLUTION_SCALE` in the script:
|
|
153
|
+
```python
|
|
154
|
+
IMAGE_RESOLUTION_SCALE = 3.0 # Higher quality
|
|
155
|
+
```
|
|
@@ -13,8 +13,7 @@ import TurndownService from 'turndown';
|
|
|
13
13
|
import WordExtractor from 'word-extractor';
|
|
14
14
|
import { parseOfficeAsync } from "officeparser";
|
|
15
15
|
import { checkLicense } from '@EE/entitlements';
|
|
16
|
-
|
|
17
|
-
const execAsync = promisify(exec);
|
|
16
|
+
import { executePythonScript } from '@SRC/utils/python-executor';
|
|
18
17
|
|
|
19
18
|
type DocumentProcessorConfig = {
|
|
20
19
|
vlm?: {
|
|
@@ -430,19 +429,28 @@ async function processPdf(
|
|
|
430
429
|
): Promise<ProcessorOutput> {
|
|
431
430
|
try {
|
|
432
431
|
let json: ProcessedDocument;
|
|
433
|
-
// Call the PDF processor
|
|
432
|
+
// Call the PDF processor script
|
|
434
433
|
if (config?.docling) {
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
const
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
434
|
+
|
|
435
|
+
console.log(`[EXULU] Processing document with document_to_markdown.py`);
|
|
436
|
+
|
|
437
|
+
const result = await executePythonScript({
|
|
438
|
+
scriptPath: 'ee/python/documents/processing/document_to_markdown.py',
|
|
439
|
+
args: [
|
|
440
|
+
paths.source,
|
|
441
|
+
'-o', paths.json,
|
|
442
|
+
'--images-dir', paths.images
|
|
443
|
+
],
|
|
444
|
+
timeout: 30 * 60 * 1000, // 30 minutes for large documents
|
|
445
|
+
});
|
|
446
|
+
|
|
447
|
+
// Log processing info from stderr
|
|
448
|
+
if (result.stderr) {
|
|
449
|
+
console.log('Processing info:', result.stderr.trim());
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
if (!result.success) {
|
|
453
|
+
throw new Error(`Document processing failed: ${result.stderr}`);
|
|
446
454
|
}
|
|
447
455
|
|
|
448
456
|
// Read the generated JSON file
|
|
@@ -460,9 +468,9 @@ async function processPdf(
|
|
|
460
468
|
}];
|
|
461
469
|
}
|
|
462
470
|
|
|
463
|
-
console.log(
|
|
464
|
-
console.log(`
|
|
465
|
-
console.log(`
|
|
471
|
+
console.log(`[EXULU] \n✓ Document processing completed successfully`);
|
|
472
|
+
console.log(`[EXULU] Total pages: ${json.length}`);
|
|
473
|
+
console.log(`[EXULU] Output file: ${paths.json}`);
|
|
466
474
|
|
|
467
475
|
if (!config?.docling && config?.vlm?.model) {
|
|
468
476
|
console.error('[EXULU] VLM validation is only supported when docling is enabled, skipping validation.');
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""
|
|
3
|
-
|
|
4
|
-
Converts a
|
|
3
|
+
Document to Markdown Converter using Docling
|
|
4
|
+
Converts a document to JSON with page-separated markdown and images.
|
|
5
5
|
|
|
6
6
|
Usage:
|
|
7
|
-
|
|
7
|
+
document_to_markdown.py <document_file_path> [-o OUTPUT_PATH] [--images-dir IMAGES_DIR]
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
10
|
import sys
|
|
@@ -346,13 +346,6 @@ def main():
|
|
|
346
346
|
help='Directory to save page images (default: <pdf_name>_images/)'
|
|
347
347
|
)
|
|
348
348
|
|
|
349
|
-
parser.add_argument(
|
|
350
|
-
'--max-tokens',
|
|
351
|
-
type=int,
|
|
352
|
-
dest='max_tokens',
|
|
353
|
-
help='Maximum number of tokens (currently not used, reserved for future use)'
|
|
354
|
-
)
|
|
355
|
-
|
|
356
349
|
# Parse arguments
|
|
357
350
|
args = parser.parse_args()
|
|
358
351
|
|