@aj-archipelago/cortex 1.3.67 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.js +27 -0
- package/helper-apps/cortex-doc-to-pdf/DocToPdfFunction/__init__.py +3 -0
- package/helper-apps/cortex-doc-to-pdf/DocToPdfFunction/function.json +20 -0
- package/helper-apps/cortex-doc-to-pdf/Dockerfile +46 -0
- package/helper-apps/cortex-doc-to-pdf/README.md +408 -0
- package/helper-apps/cortex-doc-to-pdf/converter.py +157 -0
- package/helper-apps/cortex-doc-to-pdf/docker-compose.yml +23 -0
- package/helper-apps/cortex-doc-to-pdf/document_converter.py +181 -0
- package/helper-apps/cortex-doc-to-pdf/examples/README.md +252 -0
- package/helper-apps/cortex-doc-to-pdf/examples/nodejs-client.js +266 -0
- package/helper-apps/cortex-doc-to-pdf/examples/package-lock.json +297 -0
- package/helper-apps/cortex-doc-to-pdf/examples/package.json +23 -0
- package/helper-apps/cortex-doc-to-pdf/function_app.py +85 -0
- package/helper-apps/cortex-doc-to-pdf/host.json +16 -0
- package/helper-apps/cortex-doc-to-pdf/request_handlers.py +193 -0
- package/helper-apps/cortex-doc-to-pdf/requirements.txt +3 -0
- package/helper-apps/cortex-doc-to-pdf/tests/run_tests.sh +26 -0
- package/helper-apps/cortex-doc-to-pdf/tests/test_conversion.py +320 -0
- package/helper-apps/cortex-doc-to-pdf/tests/test_streaming.py +419 -0
- package/helper-apps/cortex-file-handler/package-lock.json +1 -0
- package/helper-apps/cortex-file-handler/package.json +1 -0
- package/helper-apps/cortex-file-handler/src/services/ConversionService.js +81 -8
- package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +54 -7
- package/helper-apps/cortex-file-handler/tests/getOperations.test.js +19 -7
- package/lib/encodeCache.js +5 -0
- package/lib/keyValueStorageClient.js +5 -0
- package/lib/logger.js +1 -1
- package/lib/pathwayTools.js +8 -1
- package/lib/redisSubscription.js +6 -0
- package/lib/requestExecutor.js +4 -0
- package/lib/util.js +88 -0
- package/package.json +1 -1
- package/pathways/basePathway.js +3 -3
- package/pathways/bing_afagent.js +1 -0
- package/pathways/gemini_15_vision.js +1 -1
- package/pathways/google_cse.js +2 -2
- package/pathways/image_gemini_25.js +85 -0
- package/pathways/image_prompt_optimizer_gemini_25.js +149 -0
- package/pathways/image_qwen.js +28 -0
- package/pathways/image_seedream4.js +26 -0
- package/pathways/rag.js +1 -1
- package/pathways/rag_jarvis.js +1 -1
- package/pathways/system/entity/sys_entity_continue.js +1 -1
- package/pathways/system/entity/sys_generator_results.js +1 -1
- package/pathways/system/entity/tools/sys_tool_google_search.js +15 -2
- package/pathways/system/entity/tools/sys_tool_grok_x_search.js +3 -3
- package/pathways/system/entity/tools/sys_tool_image.js +28 -23
- package/pathways/system/entity/tools/sys_tool_image_gemini.js +135 -0
- package/server/graphql.js +9 -2
- package/server/modelExecutor.js +4 -0
- package/server/pathwayResolver.js +19 -18
- package/server/plugins/claude3VertexPlugin.js +13 -8
- package/server/plugins/gemini15ChatPlugin.js +15 -10
- package/server/plugins/gemini15VisionPlugin.js +2 -23
- package/server/plugins/gemini25ImagePlugin.js +155 -0
- package/server/plugins/modelPlugin.js +3 -2
- package/server/plugins/openAiChatPlugin.js +6 -6
- package/server/plugins/replicateApiPlugin.js +268 -12
- package/server/plugins/veoVideoPlugin.js +15 -1
- package/server/rest.js +2 -0
- package/server/typeDef.js +96 -10
- package/tests/integration/apptekTranslatePlugin.integration.test.js +1 -1
- package/tests/unit/core/pathwayManager.test.js +2 -4
- package/tests/unit/plugins/gemini25ImagePlugin.test.js +294 -0
package/config.js
CHANGED
|
@@ -423,6 +423,33 @@ var config = convict({
|
|
|
423
423
|
"Content-Type": "application/json"
|
|
424
424
|
},
|
|
425
425
|
},
|
|
426
|
+
"replicate-qwen-image": {
|
|
427
|
+
"type": "REPLICATE-API",
|
|
428
|
+
"url": "https://api.replicate.com/v1/models/qwen/qwen-image/predictions",
|
|
429
|
+
"headers": {
|
|
430
|
+
"Prefer": "wait",
|
|
431
|
+
"Authorization": "Token {{REPLICATE_API_KEY}}",
|
|
432
|
+
"Content-Type": "application/json"
|
|
433
|
+
},
|
|
434
|
+
},
|
|
435
|
+
"replicate-qwen-image-edit-plus": {
|
|
436
|
+
"type": "REPLICATE-API",
|
|
437
|
+
"url": "https://api.replicate.com/v1/models/qwen/qwen-image-edit-plus/predictions",
|
|
438
|
+
"headers": {
|
|
439
|
+
"Prefer": "wait",
|
|
440
|
+
"Authorization": "Token {{REPLICATE_API_KEY}}",
|
|
441
|
+
"Content-Type": "application/json"
|
|
442
|
+
},
|
|
443
|
+
},
|
|
444
|
+
"replicate-seedream-4": {
|
|
445
|
+
"type": "REPLICATE-API",
|
|
446
|
+
"url": "https://api.replicate.com/v1/models/bytedance/seedream-4/predictions",
|
|
447
|
+
"headers": {
|
|
448
|
+
"Prefer": "wait",
|
|
449
|
+
"Authorization": "Token {{REPLICATE_API_KEY}}",
|
|
450
|
+
"Content-Type": "application/json"
|
|
451
|
+
},
|
|
452
|
+
},
|
|
426
453
|
"azure-video-translate": {
|
|
427
454
|
"type": "AZURE-VIDEO-TRANSLATE",
|
|
428
455
|
"url": "https://eastus.api.cognitive.microsoft.com/videotranslation",
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"scriptFile": "__init__.py",
|
|
3
|
+
"bindings": [
|
|
4
|
+
{
|
|
5
|
+
"authLevel": "function",
|
|
6
|
+
"type": "httpTrigger",
|
|
7
|
+
"direction": "in",
|
|
8
|
+
"name": "req",
|
|
9
|
+
"methods": [
|
|
10
|
+
"get",
|
|
11
|
+
"post"
|
|
12
|
+
]
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"type": "http",
|
|
16
|
+
"direction": "out",
|
|
17
|
+
"name": "$return"
|
|
18
|
+
}
|
|
19
|
+
]
|
|
20
|
+
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Use official Azure Functions Python base image
|
|
2
|
+
FROM mcr.microsoft.com/azure-functions/python:4-python3.11
|
|
3
|
+
|
|
4
|
+
# Install LibreOffice + UNO Python bindings and unoconv for fast conversions
|
|
5
|
+
RUN apt-get update && \
|
|
6
|
+
apt-get install -y \
|
|
7
|
+
libreoffice \
|
|
8
|
+
libreoffice-writer \
|
|
9
|
+
libreoffice-calc \
|
|
10
|
+
libreoffice-impress \
|
|
11
|
+
python3-uno \
|
|
12
|
+
unoconv \
|
|
13
|
+
fonts-liberation \
|
|
14
|
+
fonts-dejavu \
|
|
15
|
+
fonts-liberation2 \
|
|
16
|
+
fonts-noto \
|
|
17
|
+
fonts-noto-cjk \
|
|
18
|
+
fonts-noto-color-emoji \
|
|
19
|
+
curl \
|
|
20
|
+
&& apt-get clean \
|
|
21
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
22
|
+
|
|
23
|
+
# Set LibreOffice to use headless mode by default
|
|
24
|
+
ENV SAL_USE_VCLPLUGIN=svp
|
|
25
|
+
|
|
26
|
+
# Configure the Azure Functions runtime environment
|
|
27
|
+
ENV AzureWebJobsScriptRoot=/home/site/wwwroot \
|
|
28
|
+
AzureFunctionsJobHost__Logging__Console__IsEnabled=true \
|
|
29
|
+
FUNCTIONS_WORKER_RUNTIME=python
|
|
30
|
+
|
|
31
|
+
# Copy requirements first for better layer caching
|
|
32
|
+
WORKDIR /home/site/wwwroot
|
|
33
|
+
COPY requirements.txt .
|
|
34
|
+
|
|
35
|
+
# Install Python dependencies
|
|
36
|
+
RUN pip install --no-cache-dir --upgrade pip && \
|
|
37
|
+
pip install --no-cache-dir -r requirements.txt
|
|
38
|
+
|
|
39
|
+
# Copy function app files
|
|
40
|
+
COPY . .
|
|
41
|
+
|
|
42
|
+
# Expose port for standalone server mode
|
|
43
|
+
EXPOSE 8080
|
|
44
|
+
|
|
45
|
+
# Start the application directly (no UNO listener)
|
|
46
|
+
CMD ["python", "function_app.py"]
|
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
# Document to PDF Converter - Azure Container App
|
|
2
|
+
|
|
3
|
+
A comprehensive document-to-PDF conversion service that runs as both an Azure Function and a standalone HTTP server. Built with LibreOffice, it supports **40+ document formats** including Word, Excel, PowerPoint, text files, HTML, and more.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
### Using Docker Compose (Easiest)
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
# Build and start the service
|
|
11
|
+
docker compose up --build -d
|
|
12
|
+
|
|
13
|
+
# Test the service
|
|
14
|
+
curl "http://localhost:8080/convert?uri=https://file-examples.com/storage/fe783f04fc66761fd44fb46/2017/02/file-sample_100kB.doc" -o test.pdf
|
|
15
|
+
|
|
16
|
+
# Check health
|
|
17
|
+
curl http://localhost:8080/health
|
|
18
|
+
|
|
19
|
+
# View logs
|
|
20
|
+
docker compose logs -f
|
|
21
|
+
|
|
22
|
+
# Stop the service
|
|
23
|
+
docker compose down
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Supported Formats
|
|
27
|
+
|
|
28
|
+
### Microsoft Office
|
|
29
|
+
- **Word**: `.doc`, `.docx`, `.docm`, `.dot`, `.dotx`, `.dotm`
|
|
30
|
+
- **Excel**: `.xls`, `.xlsx`, `.xlsm`, `.xlt`, `.xltx`, `.xltm`, `.csv`
|
|
31
|
+
- **PowerPoint**: `.ppt`, `.pptx`, `.pptm`, `.pot`, `.potx`, `.potm`, `.pps`, `.ppsx`, `.ppsm`
|
|
32
|
+
|
|
33
|
+
### OpenDocument
|
|
34
|
+
- **Text**: `.odt`, `.ott`
|
|
35
|
+
- **Spreadsheet**: `.ods`, `.ots`
|
|
36
|
+
- **Presentation**: `.odp`, `.otp`
|
|
37
|
+
- **Graphics**: `.odg`, `.otg`
|
|
38
|
+
|
|
39
|
+
### Web & Text
|
|
40
|
+
- **Web**: `.html`, `.htm`, `.xhtml`
|
|
41
|
+
- **Text**: `.txt`, `.rtf`, `.xml`
|
|
42
|
+
|
|
43
|
+
### Legacy Formats
|
|
44
|
+
- WordPerfect, Lotus 1-2-3, dBase files, and more
|
|
45
|
+
|
|
46
|
+
## API Usage
|
|
47
|
+
|
|
48
|
+
### Endpoints
|
|
49
|
+
|
|
50
|
+
**Standalone Server Mode** (Docker):
|
|
51
|
+
- `GET/POST /convert` - Convert document to PDF
|
|
52
|
+
- `GET /health` - Health check
|
|
53
|
+
|
|
54
|
+
**Azure Function Mode**:
|
|
55
|
+
- `GET/POST /api/convert` - Convert document to PDF
|
|
56
|
+
|
|
57
|
+
### Convert Document
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# GET request
|
|
61
|
+
curl "http://localhost:8080/convert?uri=https://example.com/document.docx" -o output.pdf
|
|
62
|
+
|
|
63
|
+
# POST request
|
|
64
|
+
curl -X POST http://localhost:8080/convert \
|
|
65
|
+
-H "Content-Type: application/json" \
|
|
66
|
+
-d '{"uri": "https://example.com/document.xlsx"}' \
|
|
67
|
+
-o output.pdf
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Response
|
|
71
|
+
|
|
72
|
+
**Success (200)**:
|
|
73
|
+
- Content-Type: `application/pdf`
|
|
74
|
+
- Body: PDF binary data
|
|
75
|
+
|
|
76
|
+
**Error (400/500)**:
|
|
77
|
+
```json
|
|
78
|
+
{
|
|
79
|
+
"error": "Error type",
|
|
80
|
+
"details": "Error details"
|
|
81
|
+
}
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Testing
|
|
85
|
+
|
|
86
|
+
### Run Tests
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# Run conversion tests
|
|
90
|
+
python3 tests/test_conversion.py
|
|
91
|
+
|
|
92
|
+
# Run streaming tests
|
|
93
|
+
python3 tests/test_streaming.py
|
|
94
|
+
|
|
95
|
+
# Or run in Docker
|
|
96
|
+
docker compose run --rm doc-to-pdf python3 tests/test_streaming.py
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Tests verify:
|
|
100
|
+
- ✅ File upload streaming (memory efficient)
|
|
101
|
+
- ✅ URI-based conversion
|
|
102
|
+
- ✅ Streaming downloads
|
|
103
|
+
- ✅ Concurrent conversions
|
|
104
|
+
- ✅ Error handling
|
|
105
|
+
- ✅ All document formats
|
|
106
|
+
|
|
107
|
+
Sample files are in the `samples/` directory.
|
|
108
|
+
|
|
109
|
+
## Deployment
|
|
110
|
+
|
|
111
|
+
### Azure Container Apps (Recommended)
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
# Create resources
|
|
115
|
+
az group create --name cortex-rg --location eastus
|
|
116
|
+
az acr create --resource-group cortex-rg --name cortexregistry --sku Basic
|
|
117
|
+
|
|
118
|
+
# Build and push
|
|
119
|
+
az acr build --registry cortexregistry --image cortex-doc-to-pdf:latest .
|
|
120
|
+
|
|
121
|
+
# Create container app environment
|
|
122
|
+
az containerapp env create \
|
|
123
|
+
--name cortex-env \
|
|
124
|
+
--resource-group cortex-rg \
|
|
125
|
+
--location eastus
|
|
126
|
+
|
|
127
|
+
# Deploy
|
|
128
|
+
az containerapp create \
|
|
129
|
+
--name cortex-doc-to-pdf \
|
|
130
|
+
--resource-group cortex-rg \
|
|
131
|
+
--environment cortex-env \
|
|
132
|
+
--image cortexregistry.azurecr.io/cortex-doc-to-pdf:latest \
|
|
133
|
+
--target-port 8080 \
|
|
134
|
+
--ingress external \
|
|
135
|
+
--min-replicas 1 \
|
|
136
|
+
--max-replicas 10 \
|
|
137
|
+
--cpu 1.0 \
|
|
138
|
+
--memory 2.0Gi \
|
|
139
|
+
--env-vars PORT=8080 \
|
|
140
|
+
--command python function_app.py
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Azure Function App
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
# Create Function App
|
|
147
|
+
az functionapp create \
|
|
148
|
+
--resource-group cortex-rg \
|
|
149
|
+
--name cortex-doc-to-pdf-func \
|
|
150
|
+
--storage-account cortexstorage \
|
|
151
|
+
--runtime python \
|
|
152
|
+
--runtime-version 3.11 \
|
|
153
|
+
--functions-version 4 \
|
|
154
|
+
--os-type Linux
|
|
155
|
+
|
|
156
|
+
# Deploy
|
|
157
|
+
func azure functionapp publish cortex-doc-to-pdf-func
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Local Development
|
|
161
|
+
|
|
162
|
+
### With Docker (Recommended)
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
# Start with auto-reload
|
|
166
|
+
docker compose up
|
|
167
|
+
|
|
168
|
+
# Rebuild after code changes
|
|
169
|
+
docker compose up --build
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### Without Docker
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
# Install LibreOffice
|
|
176
|
+
# macOS: brew install --cask libreoffice
|
|
177
|
+
# Ubuntu: sudo apt-get install libreoffice
|
|
178
|
+
|
|
179
|
+
# Install dependencies
|
|
180
|
+
pip install -r requirements.txt
|
|
181
|
+
|
|
182
|
+
# Run server
|
|
183
|
+
python function_app.py
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Configuration
|
|
187
|
+
|
|
188
|
+
### Environment Variables
|
|
189
|
+
|
|
190
|
+
| Variable | Description | Default |
|
|
191
|
+
|----------|-------------|---------|
|
|
192
|
+
| `PORT` | HTTP server port | `8080` |
|
|
193
|
+
| `AzureWebJobsStorage` | Azure Storage (Function mode) | - |
|
|
194
|
+
| `FUNCTIONS_WORKER_RUNTIME` | Azure Functions runtime | `python` |
|
|
195
|
+
|
|
196
|
+
### Conversion Timeout
|
|
197
|
+
|
|
198
|
+
Adjust in `host.json`:
|
|
199
|
+
```json
|
|
200
|
+
{
|
|
201
|
+
"functionTimeout": "00:10:00"
|
|
202
|
+
}
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Performance
|
|
206
|
+
|
|
207
|
+
### Typical Conversion Times
|
|
208
|
+
- Simple documents: 1-3 seconds
|
|
209
|
+
- Complex documents: 3-10 seconds
|
|
210
|
+
- Large presentations: 10-30 seconds
|
|
211
|
+
|
|
212
|
+
### Resource Requirements
|
|
213
|
+
- **CPU**: 1.0 vCPU minimum
|
|
214
|
+
- **Memory**: 2.0 GB minimum
|
|
215
|
+
- **Disk**: Ephemeral storage
|
|
216
|
+
|
|
217
|
+
## Troubleshooting
|
|
218
|
+
|
|
219
|
+
### Docker Build Issues
|
|
220
|
+
|
|
221
|
+
**Platform mismatch error** (Apple Silicon Macs):
|
|
222
|
+
```yaml
|
|
223
|
+
# Already configured in docker-compose.yml
|
|
224
|
+
platform: linux/amd64
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
**Disk space error**:
|
|
228
|
+
```bash
|
|
229
|
+
docker system prune -a -f
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### Conversion Failures
|
|
233
|
+
|
|
234
|
+
**Check logs**:
|
|
235
|
+
```bash
|
|
236
|
+
docker compose logs -f
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
**Test with known-good file**:
|
|
240
|
+
```bash
|
|
241
|
+
curl "http://localhost:8080/convert?uri=https://file-examples.com/storage/fe783f04fc66761fd44fb46/2017/02/file-sample_100kB.doc" -o test.pdf
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
## Project Structure
|
|
245
|
+
|
|
246
|
+
```
|
|
247
|
+
cortex-doc-to-pdf/
|
|
248
|
+
├── function_app.py # Main entry point & routing
|
|
249
|
+
├── request_handlers.py # HTTP request/response handling
|
|
250
|
+
├── document_converter.py # Conversion business logic
|
|
251
|
+
├── converter.py # LibreOffice wrapper
|
|
252
|
+
├── tests/ # Test suite
|
|
253
|
+
│ ├── test_streaming.py # Streaming tests
|
|
254
|
+
│ ├── test_conversion.py # Conversion tests
|
|
255
|
+
│ └── run_tests.sh # Test runner
|
|
256
|
+
├── samples/ # Sample documents
|
|
257
|
+
├── Dockerfile # Container image
|
|
258
|
+
├── docker-compose.yml # Local orchestration
|
|
259
|
+
├── requirements.txt # Dependencies
|
|
260
|
+
└── README.md # Documentation
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## Examples
|
|
264
|
+
|
|
265
|
+
### URI-Based Conversion
|
|
266
|
+
|
|
267
|
+
```bash
|
|
268
|
+
# Word document via URI
|
|
269
|
+
curl -X POST http://localhost:8080/convert \
|
|
270
|
+
-H "Content-Type: application/json" \
|
|
271
|
+
-d '{"uri": "https://example.com/document.docx"}' \
|
|
272
|
+
-o output.pdf
|
|
273
|
+
|
|
274
|
+
# Or with GET (URL-encode special characters)
|
|
275
|
+
curl "http://localhost:8080/convert?uri=https://example.com/file.xlsx" -o output.pdf
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
### File Upload (Recommended - Streaming)
|
|
279
|
+
|
|
280
|
+
Upload local files directly - **no need for remote URI**. Files are streamed in 8KB chunks for memory efficiency.
|
|
281
|
+
|
|
282
|
+
```bash
|
|
283
|
+
# Upload file directly (streams upload & download)
|
|
284
|
+
curl -X POST http://localhost:8080/convert \
|
|
285
|
+
-F "file=@document.xlsx" \
|
|
286
|
+
-o output.pdf
|
|
287
|
+
|
|
288
|
+
# Works with any supported format
|
|
289
|
+
curl -X POST http://localhost:8080/convert \
|
|
290
|
+
-F "file=@presentation.pptx" \
|
|
291
|
+
-o slides.pdf
|
|
292
|
+
|
|
293
|
+
# Large files are handled efficiently (no memory bloat)
|
|
294
|
+
curl -X POST http://localhost:8080/convert \
|
|
295
|
+
-F "file=@large-spreadsheet.xlsx" \
|
|
296
|
+
-o output.pdf
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
**Why file upload is recommended:**
|
|
300
|
+
- ✅ **Streaming**: Chunked upload (8KB) and download
|
|
301
|
+
- ✅ **Memory efficient**: Handles large files without RAM bloat
|
|
302
|
+
- ✅ **Direct**: No need to host files on a server first
|
|
303
|
+
- ✅ **Fast**: No download step required
|
|
304
|
+
|
|
305
|
+
### JavaScript/Node.js Example
|
|
306
|
+
|
|
307
|
+
**See `examples/` folder for complete working examples!**
|
|
308
|
+
|
|
309
|
+
```javascript
|
|
310
|
+
const axios = require('axios');
|
|
311
|
+
const fs = require('fs');
|
|
312
|
+
const FormData = require('form-data');
|
|
313
|
+
|
|
314
|
+
// Upload file with streaming (both upload & download)
|
|
315
|
+
async function convertToPDF(inputFile, outputFile) {
|
|
316
|
+
const form = new FormData();
|
|
317
|
+
form.append('file', fs.createReadStream(inputFile));
|
|
318
|
+
|
|
319
|
+
const response = await axios({
|
|
320
|
+
method: 'POST',
|
|
321
|
+
url: 'http://localhost:8080/', // Can use / or /convert
|
|
322
|
+
data: form,
|
|
323
|
+
headers: form.getHeaders(),
|
|
324
|
+
responseType: 'stream', // Stream the response
|
|
325
|
+
});
|
|
326
|
+
|
|
327
|
+
// Stream PDF to file
|
|
328
|
+
const writer = fs.createWriteStream(outputFile);
|
|
329
|
+
response.data.pipe(writer);
|
|
330
|
+
|
|
331
|
+
return new Promise((resolve, reject) => {
|
|
332
|
+
writer.on('finish', resolve);
|
|
333
|
+
writer.on('error', reject);
|
|
334
|
+
});
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
// Usage
|
|
338
|
+
await convertToPDF('document.docx', 'output.pdf');
|
|
339
|
+
```
|
|
340
|
+
|
|
341
|
+
**Features:**
|
|
342
|
+
- ✅ Streaming upload (memory efficient)
|
|
343
|
+
- ✅ Streaming download (direct to file)
|
|
344
|
+
- ✅ Progress tracking support
|
|
345
|
+
- ✅ Works in Node.js and Browser
|
|
346
|
+
|
|
347
|
+
**Full example:** See `examples/nodejs-client.js`
|
|
348
|
+
|
|
349
|
+
### Python Example
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
import requests
|
|
353
|
+
|
|
354
|
+
# Method 1: Upload local file (Recommended - Streaming)
|
|
355
|
+
def convert_file_to_pdf(file_path):
|
|
356
|
+
with open(file_path, 'rb') as f:
|
|
357
|
+
files = {'file': f}
|
|
358
|
+
response = requests.post(
|
|
359
|
+
'http://localhost:8080/convert',
|
|
360
|
+
files=files
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
if response.status_code == 200:
|
|
364
|
+
with open('output.pdf', 'wb') as f:
|
|
365
|
+
f.write(response.content)
|
|
366
|
+
print('✓ PDF created successfully')
|
|
367
|
+
else:
|
|
368
|
+
print(f'✗ Error: {response.json()}')
|
|
369
|
+
|
|
370
|
+
# Method 2: Convert from URI
|
|
371
|
+
def convert_url_to_pdf(document_url):
|
|
372
|
+
response = requests.post(
|
|
373
|
+
'http://localhost:8080/convert',
|
|
374
|
+
json={'uri': document_url}
|
|
375
|
+
)
|
|
376
|
+
|
|
377
|
+
if response.status_code == 200:
|
|
378
|
+
with open('output.pdf', 'wb') as f:
|
|
379
|
+
f.write(response.content)
|
|
380
|
+
print('✓ PDF created successfully')
|
|
381
|
+
else:
|
|
382
|
+
print(f'✗ Error: {response.json()}')
|
|
383
|
+
|
|
384
|
+
# Upload local file (streams efficiently)
|
|
385
|
+
convert_file_to_pdf('document.docx')
|
|
386
|
+
|
|
387
|
+
# Or convert from URL
|
|
388
|
+
convert_url_to_pdf('https://example.com/document.docx')
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
## Security Considerations
|
|
392
|
+
|
|
393
|
+
- **Authentication**: Add API keys or OAuth for production
|
|
394
|
+
- **Rate Limiting**: Implement at API gateway level
|
|
395
|
+
- **Input Validation**: Validate the URI format and restrict sources to an allowlist
|
|
396
|
+
- **HTTPS**: Use reverse proxy or Azure ingress
|
|
397
|
+
- **Resource Limits**: Configure memory and CPU limits
|
|
398
|
+
|
|
399
|
+
## License
|
|
400
|
+
|
|
401
|
+
This project is part of the Cortex project.
|
|
402
|
+
|
|
403
|
+
## Support
|
|
404
|
+
|
|
405
|
+
For issues or questions:
|
|
406
|
+
1. Check the logs: `docker compose logs -f`
|
|
407
|
+
2. Run tests: `./tests/run_tests.sh`
|
|
408
|
+
3. Verify LibreOffice: `docker compose run --rm --entrypoint soffice doc-to-pdf --version`
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional, List
|
|
7
|
+
try:
|
|
8
|
+
import uno # type: ignore
|
|
9
|
+
from com.sun.star.beans import PropertyValue # type: ignore
|
|
10
|
+
except Exception:
|
|
11
|
+
uno = None
|
|
12
|
+
PropertyValue = None
|
|
13
|
+
|
|
14
|
+
class DocumentConverter:
    """
    Convert documents to PDF by running LibreOffice (``soffice``) headlessly.

    Each conversion spawns one LibreOffice process with a minimal set of
    startup flags. The original authors report typical conversion times of
    2-4 seconds depending on file size and complexity.

    Failures during conversion are logged and reported by returning ``None``
    rather than by raising; only a missing input file or a missing
    LibreOffice installation raises.
    """

    # Lower-case file extensions (with leading dot) accepted for conversion.
    SUPPORTED_FORMATS = {
        '.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm',
        '.xls', '.xlsx', '.xlsm', '.xlt', '.xltx', '.xltm', '.csv',
        '.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx', '.ppsm',
        '.odt', '.ott', '.ods', '.ots', '.odp', '.otp', '.odg', '.otg', '.odf',
        '.txt', '.rtf',
        '.html', '.htm', '.xhtml',
        '.xml', '.wpd', '.wps',
        '.wk1', '.wks', '.123', '.dif', '.dbf',
    }

    def __init__(self, libreoffice_path: Optional[str] = None):
        """
        Create a converter bound to a LibreOffice executable.

        Args:
            libreoffice_path: Explicit path to the ``soffice``/``libreoffice``
                executable. When omitted (or empty), the executable is
                auto-detected via ``_find_libreoffice``.

        Raises:
            RuntimeError: If no path was given and none could be detected.
        """
        self.libreoffice_path = libreoffice_path or self._find_libreoffice()
        if not self.libreoffice_path:
            raise RuntimeError("LibreOffice not found!")

        logging.info(f"Using LibreOffice at: {self.libreoffice_path}")

    def _find_libreoffice(self) -> Optional[str]:
        """
        Locate a LibreOffice executable.

        Searches ``PATH`` for ``soffice`` then ``libreoffice``, and falls back
        to a short list of well-known install locations.

        Returns:
            The path of the first executable found, or ``None`` if none exists.
        """
        # Local import: the module's top-level import list is left untouched.
        import shutil

        # shutil.which is portable and cannot raise for a plain command name,
        # so no exception swallowing is needed (the previous implementation
        # shelled out to `which` inside a bare `except: pass`).
        for cmd in ('soffice', 'libreoffice'):
            located = shutil.which(cmd)
            if located:
                return located

        common_paths = [
            '/usr/bin/libreoffice',
            '/usr/bin/soffice',
            '/Applications/LibreOffice.app/Contents/MacOS/soffice',
        ]

        for path in common_paths:
            if os.path.exists(path):
                return path

        return None

    def is_supported_format(self, file_extension: str) -> bool:
        """
        Check whether a file extension is in ``SUPPORTED_FORMATS``.

        Args:
            file_extension: Extension with or without the leading dot; the
                comparison is case-insensitive.

        Returns:
            ``True`` if the normalized extension is supported.
        """
        if not file_extension.startswith('.'):
            file_extension = '.' + file_extension
        return file_extension.lower() in self.SUPPORTED_FORMATS

    def get_supported_formats(self) -> List[str]:
        """Return the supported extensions as an alphabetically sorted list."""
        return sorted(self.SUPPORTED_FORMATS)

    def convert_to_pdf(
        self,
        input_file: str,
        output_dir: Optional[str] = None,
        timeout: int = 30
    ) -> Optional[str]:
        """
        Convert a document to PDF with a single headless LibreOffice run.

        The PDF is written to ``output_dir`` under the input file's stem with
        a ``.pdf`` suffix.

        Args:
            input_file: Path of the document to convert.
            output_dir: Directory for the PDF; defaults to the directory
                containing ``input_file``.
            timeout: Maximum number of seconds to wait for LibreOffice.

        Returns:
            The path of the generated PDF, or ``None`` if LibreOffice exited
            with a non-zero status, produced no output file, timed out, or
            any other error occurred (the cause is logged).

        Raises:
            FileNotFoundError: If ``input_file`` does not exist.
        """
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file not found: {input_file}")

        input_path = Path(input_file)
        if output_dir is None:
            output_dir = str(input_path.parent)

        pdf_filename = input_path.stem + ".pdf"
        output_path = os.path.join(output_dir, pdf_filename)

        start_time = time.time()
        logging.info(f"Converting {input_file} to PDF...")

        try:
            # Minimal LibreOffice flag set: no UI, no recovery/wizard/logo.
            cmd = [
                self.libreoffice_path,
                '--headless',
                '--invisible',
                '--nocrashreport',
                '--nodefault',
                '--nofirststartwizard',
                '--nolockcheck',
                '--nologo',
                '--norestore',
                '--convert-to', 'pdf:writer_pdf_Export',
                '--outdir', output_dir,
                input_file
            ]

            # HOME is pointed at the output directory for the child process;
            # SAL_USE_VCLPLUGIN=svp selects the headless rendering backend.
            env = {**os.environ, 'HOME': output_dir, 'SAL_USE_VCLPLUGIN': 'svp'}

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
                env=env,
                stdin=subprocess.DEVNULL
            )

            elapsed = time.time() - start_time

            if result.returncode != 0:
                logging.error(f"Conversion failed ({elapsed:.2f}s): {result.stderr}")
                return None

            # A zero exit status alone is not trusted: confirm the file exists.
            if not os.path.exists(output_path):
                logging.error(f"PDF not created ({elapsed:.2f}s): {output_path}")
                return None

            file_size = os.path.getsize(output_path)
            logging.info(f"✓ Converted in {elapsed:.2f}s ({file_size/1024:.1f}KB): {output_path}")
            return output_path

        except subprocess.TimeoutExpired:
            elapsed = time.time() - start_time
            logging.error(f"Conversion timed out after {elapsed:.2f}s")
            return None
        except Exception as e:
            # Boundary handler: any launch/IO error is logged and reported as None.
            elapsed = time.time() - start_time
            logging.error(f"Conversion error ({elapsed:.2f}s): {e}", exc_info=True)
            return None
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
services:
|
|
2
|
+
doc-to-pdf:
|
|
3
|
+
build:
|
|
4
|
+
context: .
|
|
5
|
+
platform: linux/amd64 # Force AMD64 for Azure compatibility (works on Mac via emulation)
|
|
6
|
+
ports:
|
|
7
|
+
- "8080:8080"
|
|
8
|
+
environment:
|
|
9
|
+
- PORT=8080
|
|
10
|
+
|
|
11
|
+
tmpfs:
|
|
12
|
+
- /tmp:size=256m
|
|
13
|
+
volumes:
|
|
14
|
+
# Mount local code for development (optional)
|
|
15
|
+
- ./function_app.py:/home/site/wwwroot/function_app.py
|
|
16
|
+
- ./converter.py:/home/site/wwwroot/converter.py
|
|
17
|
+
restart: unless-stopped
|
|
18
|
+
healthcheck:
|
|
19
|
+
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
|
|
20
|
+
interval: 30s
|
|
21
|
+
timeout: 10s
|
|
22
|
+
retries: 3
|
|
23
|
+
start_period: 40s
|