@aj-archipelago/cortex 1.3.50 → 1.3.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.js +1 -1
- package/helper-apps/cortex-browser/Dockerfile +19 -31
- package/helper-apps/cortex-browser/function_app.py +708 -181
- package/helper-apps/cortex-browser/requirements.txt +4 -4
- package/helper-apps/cortex-file-handler/{.env.test.azure → .env.test.azure.sample} +2 -1
- package/helper-apps/cortex-file-handler/{.env.test.gcs → .env.test.gcs.sample} +2 -1
- package/helper-apps/cortex-file-handler/{.env.test → .env.test.sample} +2 -1
- package/helper-apps/cortex-file-handler/Dockerfile +1 -1
- package/helper-apps/cortex-file-handler/INTERFACE.md +178 -0
- package/helper-apps/cortex-file-handler/function.json +2 -6
- package/helper-apps/cortex-file-handler/package-lock.json +6065 -5964
- package/helper-apps/cortex-file-handler/package.json +11 -6
- package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +12 -9
- package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +21 -18
- package/helper-apps/cortex-file-handler/scripts/test-azure.sh +4 -1
- package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +1 -1
- package/helper-apps/cortex-file-handler/src/blobHandler.js +1056 -0
- package/helper-apps/cortex-file-handler/{constants.js → src/constants.js} +64 -48
- package/helper-apps/cortex-file-handler/src/docHelper.js +37 -0
- package/helper-apps/cortex-file-handler/{fileChunker.js → src/fileChunker.js} +97 -65
- package/helper-apps/cortex-file-handler/{helper.js → src/helper.js} +34 -25
- package/helper-apps/cortex-file-handler/src/index.js +608 -0
- package/helper-apps/cortex-file-handler/src/localFileHandler.js +107 -0
- package/helper-apps/cortex-file-handler/{redis.js → src/redis.js} +23 -17
- package/helper-apps/cortex-file-handler/src/services/ConversionService.js +309 -0
- package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +57 -0
- package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +177 -0
- package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +258 -0
- package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +182 -0
- package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +86 -0
- package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +53 -0
- package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +259 -0
- package/helper-apps/cortex-file-handler/src/start.js +88 -0
- package/helper-apps/cortex-file-handler/src/utils/filenameUtils.js +28 -0
- package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +144 -0
- package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +90 -66
- package/helper-apps/cortex-file-handler/tests/conversionResilience.test.js +152 -0
- package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +105 -108
- package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +462 -0
- package/helper-apps/cortex-file-handler/tests/files/DOCX_TestPage.docx +0 -0
- package/helper-apps/cortex-file-handler/tests/files/tests-example.xls +0 -0
- package/helper-apps/cortex-file-handler/tests/getOperations.test.js +307 -0
- package/helper-apps/cortex-file-handler/tests/postOperations.test.js +291 -0
- package/helper-apps/cortex-file-handler/tests/start.test.js +984 -647
- package/helper-apps/cortex-file-handler/tests/storage/AzureStorageProvider.test.js +120 -0
- package/helper-apps/cortex-file-handler/tests/storage/GCSStorageProvider.test.js +193 -0
- package/helper-apps/cortex-file-handler/tests/storage/LocalStorageProvider.test.js +148 -0
- package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +100 -0
- package/helper-apps/cortex-file-handler/tests/storage/StorageService.test.js +113 -0
- package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +85 -0
- package/helper-apps/cortex-markitdown/.funcignore +1 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py +64 -0
- package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json +21 -0
- package/helper-apps/cortex-markitdown/README.md +94 -0
- package/helper-apps/cortex-markitdown/host.json +15 -0
- package/helper-apps/cortex-markitdown/requirements.txt +2 -0
- package/lib/entityConstants.js +1 -1
- package/lib/requestExecutor.js +44 -36
- package/package.json +1 -1
- package/pathways/system/entity/tools/sys_tool_readfile.js +24 -2
- package/server/plugins/openAiWhisperPlugin.js +59 -87
- package/helper-apps/cortex-file-handler/blobHandler.js +0 -567
- package/helper-apps/cortex-file-handler/docHelper.js +0 -144
- package/helper-apps/cortex-file-handler/index.js +0 -440
- package/helper-apps/cortex-file-handler/localFileHandler.js +0 -108
- package/helper-apps/cortex-file-handler/start.js +0 -63
- package/helper-apps/cortex-file-handler/tests/docHelper.test.js +0 -148
|
@@ -4,14 +4,13 @@ certifi==2025.4.26
|
|
|
4
4
|
charset-normalizer==3.4.2
|
|
5
5
|
courlan==1.3.2
|
|
6
6
|
dateparser==1.2.1
|
|
7
|
-
greenlet==3.
|
|
7
|
+
greenlet==3.0.3
|
|
8
8
|
htmldate==1.9.3
|
|
9
9
|
jusText==3.0.2
|
|
10
10
|
lxml==5.4.0
|
|
11
11
|
lxml_html_clean==0.4.2
|
|
12
12
|
MarkupSafe==3.0.2
|
|
13
|
-
|
|
14
|
-
pyee==13.0.0
|
|
13
|
+
pyee==11.1.0
|
|
15
14
|
python-dateutil==2.9.0.post0
|
|
16
15
|
pytz==2025.2
|
|
17
16
|
regex==2024.11.6
|
|
@@ -20,5 +19,6 @@ tld==0.13
|
|
|
20
19
|
trafilatura==2.0.0
|
|
21
20
|
typing_extensions==4.13.2
|
|
22
21
|
tzlocal==5.3.1
|
|
23
|
-
urllib3==2.4.0
|
|
24
22
|
Werkzeug==3.1.3
|
|
23
|
+
playwright==1.45.0
|
|
24
|
+
aiohttp
|
|
@@ -3,4 +3,5 @@ REDIS_CONNECTION_STRING=redis://default:redispw@localhost:32768
|
|
|
3
3
|
AZURE_STORAGE_CONNECTION_STRING=UseDevelopmentStorage=true
|
|
4
4
|
AZURE_STORAGE_CONTAINER_NAME=test-container
|
|
5
5
|
NODE_ENV=test
|
|
6
|
-
PORT=7072 # Different port for testing
|
|
6
|
+
PORT=7072 # Different port for testing
|
|
7
|
+
MARKITDOWN_CONVERT_URL= #cortex-markitdown url
|
|
@@ -6,4 +6,5 @@ GCS_BUCKETNAME=cortextempfiles
|
|
|
6
6
|
AZURE_STORAGE_CONNECTION_STRING=UseDevelopmentStorage=true
|
|
7
7
|
AZURE_STORAGE_CONTAINER_NAME=test-container
|
|
8
8
|
NODE_ENV=test
|
|
9
|
-
PORT=7072 # Different port for testing
|
|
9
|
+
PORT=7072 # Different port for testing
|
|
10
|
+
MARKITDOWN_CONVERT_URL= #cortex-markitdown url
|
|
@@ -4,4 +4,5 @@ REDIS_CONNECTION_STRING=redis://default:redispw@localhost:32768
|
|
|
4
4
|
AZURE_STORAGE_CONTAINER_NAME=test-container
|
|
5
5
|
#GCP_SERVICE_ACCOUNT_KEY={"type":"service_account","project_id":"test-project"}
|
|
6
6
|
NODE_ENV=test
|
|
7
|
-
PORT=7072 # Different port for testing
|
|
7
|
+
PORT=7072 # Different port for testing
|
|
8
|
+
MARKITDOWN_CONVERT_URL= #cortex-markitdown url
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# Cortex File Handler Interface Documentation
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
The Cortex File Handler is a service that processes files through various operations including uploading, downloading, chunking, and document processing. It supports multiple storage backends (Azure Blob Storage, Google Cloud Storage, and Local File System).
|
|
5
|
+
|
|
6
|
+
## Request Methods
|
|
7
|
+
|
|
8
|
+
### POST
|
|
9
|
+
- **Purpose**: Upload a file
|
|
10
|
+
- **Content-Type**: `multipart/form-data`
|
|
11
|
+
- **Parameters**:
|
|
12
|
+
- `hash` (optional): Unique identifier for the file
|
|
13
|
+
- `requestId` (required): Unique identifier for the request
|
|
14
|
+
- File content must be included in the form data
|
|
15
|
+
- **Behavior**:
|
|
16
|
+
- Uploads file to primary storage (Azure or Local)
|
|
17
|
+
- If GCS is configured, also uploads to GCS
|
|
18
|
+
- If hash is provided, stores file metadata in Redis
|
|
19
|
+
- Returns upload result with file URLs
|
|
20
|
+
- **Response**: Object containing:
|
|
21
|
+
- `url`: Primary storage URL
|
|
22
|
+
- `gcs`: GCS URL (if GCS is configured)
|
|
23
|
+
- `hash`: Hash value (if provided)
|
|
24
|
+
- `message`: Success message
|
|
25
|
+
- `filename`: Original filename
|
|
26
|
+
- **Note**: The `save` parameter is not supported in POST requests. To convert and save a document as text, use GET with the `save` parameter.
|
|
27
|
+
|
|
28
|
+
### GET
|
|
29
|
+
- **Purpose**: Process or retrieve files
|
|
30
|
+
- **Parameters** (can be in query string or request body):
|
|
31
|
+
- `uri` (required if not using fetch/load/restore): URL of the file to process
|
|
32
|
+
- Requires `requestId` parameter
|
|
33
|
+
- No Redis caching
|
|
34
|
+
- Direct processing based on file type
|
|
35
|
+
- `requestId` (required with `uri`): Unique identifier for the request
|
|
36
|
+
- `save` (optional): If true, saves document as text file
|
|
37
|
+
- When true, converts document to text and saves to primary storage only (Azure or Local)
|
|
38
|
+
- Does not save to GCS
|
|
39
|
+
- Original document is deleted from storage after text conversion
|
|
40
|
+
- `hash` (optional): Unique identifier for the file
|
|
41
|
+
- `checkHash` (optional): Check if hash exists
|
|
42
|
+
- `clearHash` (optional): Remove hash from storage
|
|
43
|
+
- `fetch`/`load`/`restore` (optional): URL to fetch remote file (these are aliases - any of the three parameters will trigger the same remote file processing behavior)
|
|
44
|
+
- Does not require `requestId`
|
|
45
|
+
- Uses Redis caching
|
|
46
|
+
- Downloads and validates file first
|
|
47
|
+
- Ensures correct file extension
|
|
48
|
+
- Truncates long filenames
|
|
49
|
+
- **Behavior**:
|
|
50
|
+
- For documents (PDF, DOC, etc.):
|
|
51
|
+
- If `save=true`:
|
|
52
|
+
- Converts document to text
|
|
53
|
+
- Saves text file to primary storage (Azure or Local)
|
|
54
|
+
- Deletes original document from storage
|
|
55
|
+
- Does not save to GCS
|
|
56
|
+
- Returns object with primary storage URL
|
|
57
|
+
- If `save=false`:
|
|
58
|
+
- Converts document to text
|
|
59
|
+
- Returns array of text chunks
|
|
60
|
+
- Does not persist any files
|
|
61
|
+
- For media files:
|
|
62
|
+
- Splits into chunks
|
|
63
|
+
- Uploads chunks to primary storage and GCS (if configured)
|
|
64
|
+
- Returns chunk information with offsets
|
|
65
|
+
- For remote files (`fetch`/`load`/`restore`):
|
|
66
|
+
- Downloads file from URL
|
|
67
|
+
- Processes based on file type
|
|
68
|
+
- Returns processed result
|
|
69
|
+
- Caches result in Redis using URL as key
|
|
70
|
+
- Updates Redis timestamp on subsequent requests
|
|
71
|
+
- Truncates filenames longer than 200 characters
|
|
72
|
+
- Ensures correct file extension based on content type
|
|
73
|
+
|
|
74
|
+
### DELETE
|
|
75
|
+
- **Purpose**: Remove files from storage
|
|
76
|
+
- **Parameters** (can be in query string or request body):
|
|
77
|
+
- `requestId` (required): Unique identifier for the request
|
|
78
|
+
- **Behavior**:
|
|
79
|
+
- Deletes file from primary storage (Azure or Local)
|
|
80
|
+
- Deletes file from GCS if configured
|
|
81
|
+
- Returns deletion result
|
|
82
|
+
- **Response**: Array of deleted file URLs
|
|
83
|
+
|
|
84
|
+
## Storage Configuration
|
|
85
|
+
- **Azure**: Enabled if `AZURE_STORAGE_CONNECTION_STRING` is set
|
|
86
|
+
- **GCS**: Enabled if `GCP_SERVICE_ACCOUNT_KEY_BASE64` or `GCP_SERVICE_ACCOUNT_KEY` is set
|
|
87
|
+
- **Local**: Used as fallback if Azure is not configured
|
|
88
|
+
|
|
89
|
+
## Response Format
|
|
90
|
+
- **Success**:
|
|
91
|
+
- Status: 200
|
|
92
|
+
- Body: Varies by operation (see specific methods above)
|
|
93
|
+
- **Error**:
|
|
94
|
+
- Status: 400/404/500
|
|
95
|
+
- Body: Error message string
|
|
96
|
+
|
|
97
|
+
## Progress Tracking
|
|
98
|
+
- Progress updates are published to Redis for each operation
|
|
99
|
+
- Progress includes:
|
|
100
|
+
- `progress`: Completion fraction from 0 to 1 (not a 0-100 percentage)
|
|
101
|
+
- `completedCount`: Number of completed steps
|
|
102
|
+
- `totalCount`: Total number of steps
|
|
103
|
+
- `numberOfChunks`: Number of chunks (for media files)
|
|
104
|
+
- `data`: Additional operation data
|
|
105
|
+
- Progress updates are published to Redis channel associated with `requestId`
|
|
106
|
+
|
|
107
|
+
## File Types
|
|
108
|
+
- **Documents**: Processed based on `DOC_EXTENSIONS` list
|
|
109
|
+
- Supported extensions:
|
|
110
|
+
- Text: .txt, .json, .csv, .md, .xml, .js, .html, .css
|
|
111
|
+
- Office: .doc, .docx, .xls, .xlsx
|
|
112
|
+
- Document processing limitations:
|
|
113
|
+
- PDFs: Does not support scanned, encrypted, or password-protected PDFs
|
|
114
|
+
- Requires OCR for PDFs without embedded fonts
|
|
115
|
+
- Text chunking:
|
|
116
|
+
- Maximum chunk size: 10,000 characters
|
|
117
|
+
- Chunks are split at sentence boundaries when possible
|
|
118
|
+
- Returns array of text chunks
|
|
119
|
+
- **Media**: All other file types, processed through chunking
|
|
120
|
+
- Chunked into smaller pieces for processing
|
|
121
|
+
- Each chunk is stored separately
|
|
122
|
+
- Media chunking behavior:
|
|
123
|
+
- Default chunk duration: 500 seconds
|
|
124
|
+
- Chunks are processed in parallel (3 at a time)
|
|
125
|
+
- Audio is converted to MP3 format (128kbps)
|
|
126
|
+
- Uses 4MB read buffer for file processing
|
|
127
|
+
- Supported media types:
|
|
128
|
+
- Images: .jpg, .jpeg, .png, .webp, .heic, .heif, .pdf
|
|
129
|
+
- Video: .mp4, .mpeg, .mov, .avi, .flv, .mpg, .webm, .wmv, .3gp
|
|
130
|
+
- Audio: .wav, .mp3, .aac, .ogg, .flac, .m4a
|
|
131
|
+
- File download behavior:
|
|
132
|
+
- 30 second timeout for downloads
|
|
133
|
+
- Supports streaming downloads
|
|
134
|
+
- Handles URL encoding/decoding
|
|
135
|
+
- Truncates filenames longer than 200 characters
|
|
136
|
+
|
|
137
|
+
## Storage Behavior
|
|
138
|
+
- **Primary Storage** (Azure or Local):
|
|
139
|
+
- Files are stored with UUID-based names
|
|
140
|
+
- Organized by requestId folders
|
|
141
|
+
- Azure: Uses SAS tokens for access
|
|
142
|
+
- Local: Served via HTTP on configured port
|
|
143
|
+
- **GCS** (if configured):
|
|
144
|
+
- Files stored with gs:// protocol URLs
|
|
145
|
+
- Same folder structure as primary storage
|
|
146
|
+
- Used for uploaded files and media file chunks; converted text documents (`save=true`) are not saved to GCS
|
|
147
|
+
- **Redis**:
|
|
148
|
+
- Stores file metadata and URLs
|
|
149
|
+
- Used for caching remote file results
|
|
150
|
+
- Tracks file access timestamps
|
|
151
|
+
- Used for progress tracking
|
|
152
|
+
|
|
153
|
+
## Cleanup
|
|
154
|
+
- Automatic cleanup of inactive files
|
|
155
|
+
- Removes files from:
|
|
156
|
+
- Primary storage (Azure/Local)
|
|
157
|
+
- GCS (if configured)
|
|
158
|
+
- Redis file store map
|
|
159
|
+
- Cleanup is triggered on each request but only runs if not already in progress
|
|
160
|
+
- Temporary files are cleaned up:
|
|
161
|
+
- After 1 hour of inactivity
|
|
162
|
+
- After successful processing
|
|
163
|
+
- On error conditions
|
|
164
|
+
|
|
165
|
+
## Error Handling
|
|
166
|
+
- **400 Bad Request**:
|
|
167
|
+
- Missing required parameters
|
|
168
|
+
- Invalid or inaccessible URL
|
|
169
|
+
- Unsupported file type
|
|
170
|
+
- **404 Not Found**:
|
|
171
|
+
- File or hash not found
|
|
172
|
+
- File not found in storage
|
|
173
|
+
- **500 Internal Server Error**:
|
|
174
|
+
- Processing errors
|
|
175
|
+
- Storage errors
|
|
176
|
+
- Document conversion errors
|
|
177
|
+
- PDF processing errors (scanned, encrypted, password-protected)
|
|
178
|
+
- All errors include descriptive message in response body
|