@aj-archipelago/cortex 1.3.50 → 1.3.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/config.js +1 -1
  2. package/helper-apps/cortex-browser/Dockerfile +19 -31
  3. package/helper-apps/cortex-browser/function_app.py +708 -181
  4. package/helper-apps/cortex-browser/requirements.txt +4 -4
  5. package/helper-apps/cortex-file-handler/{.env.test.azure → .env.test.azure.sample} +2 -1
  6. package/helper-apps/cortex-file-handler/{.env.test.gcs → .env.test.gcs.sample} +2 -1
  7. package/helper-apps/cortex-file-handler/{.env.test → .env.test.sample} +2 -1
  8. package/helper-apps/cortex-file-handler/Dockerfile +1 -1
  9. package/helper-apps/cortex-file-handler/INTERFACE.md +178 -0
  10. package/helper-apps/cortex-file-handler/function.json +2 -6
  11. package/helper-apps/cortex-file-handler/package-lock.json +6065 -5964
  12. package/helper-apps/cortex-file-handler/package.json +11 -6
  13. package/helper-apps/cortex-file-handler/scripts/setup-azure-container.js +12 -9
  14. package/helper-apps/cortex-file-handler/scripts/setup-test-containers.js +21 -18
  15. package/helper-apps/cortex-file-handler/scripts/test-azure.sh +4 -1
  16. package/helper-apps/cortex-file-handler/scripts/test-gcs.sh +1 -1
  17. package/helper-apps/cortex-file-handler/src/blobHandler.js +1056 -0
  18. package/helper-apps/cortex-file-handler/{constants.js → src/constants.js} +64 -48
  19. package/helper-apps/cortex-file-handler/src/docHelper.js +37 -0
  20. package/helper-apps/cortex-file-handler/{fileChunker.js → src/fileChunker.js} +97 -65
  21. package/helper-apps/cortex-file-handler/{helper.js → src/helper.js} +34 -25
  22. package/helper-apps/cortex-file-handler/src/index.js +608 -0
  23. package/helper-apps/cortex-file-handler/src/localFileHandler.js +107 -0
  24. package/helper-apps/cortex-file-handler/{redis.js → src/redis.js} +23 -17
  25. package/helper-apps/cortex-file-handler/src/services/ConversionService.js +309 -0
  26. package/helper-apps/cortex-file-handler/src/services/FileConversionService.js +57 -0
  27. package/helper-apps/cortex-file-handler/src/services/storage/AzureStorageProvider.js +177 -0
  28. package/helper-apps/cortex-file-handler/src/services/storage/GCSStorageProvider.js +258 -0
  29. package/helper-apps/cortex-file-handler/src/services/storage/LocalStorageProvider.js +182 -0
  30. package/helper-apps/cortex-file-handler/src/services/storage/StorageFactory.js +86 -0
  31. package/helper-apps/cortex-file-handler/src/services/storage/StorageProvider.js +53 -0
  32. package/helper-apps/cortex-file-handler/src/services/storage/StorageService.js +259 -0
  33. package/helper-apps/cortex-file-handler/src/start.js +88 -0
  34. package/helper-apps/cortex-file-handler/src/utils/filenameUtils.js +28 -0
  35. package/helper-apps/cortex-file-handler/tests/FileConversionService.test.js +144 -0
  36. package/helper-apps/cortex-file-handler/tests/blobHandler.test.js +90 -66
  37. package/helper-apps/cortex-file-handler/tests/conversionResilience.test.js +152 -0
  38. package/helper-apps/cortex-file-handler/tests/fileChunker.test.js +105 -108
  39. package/helper-apps/cortex-file-handler/tests/fileUpload.test.js +462 -0
  40. package/helper-apps/cortex-file-handler/tests/files/DOCX_TestPage.docx +0 -0
  41. package/helper-apps/cortex-file-handler/tests/files/tests-example.xls +0 -0
  42. package/helper-apps/cortex-file-handler/tests/getOperations.test.js +307 -0
  43. package/helper-apps/cortex-file-handler/tests/postOperations.test.js +291 -0
  44. package/helper-apps/cortex-file-handler/tests/start.test.js +984 -647
  45. package/helper-apps/cortex-file-handler/tests/storage/AzureStorageProvider.test.js +120 -0
  46. package/helper-apps/cortex-file-handler/tests/storage/GCSStorageProvider.test.js +193 -0
  47. package/helper-apps/cortex-file-handler/tests/storage/LocalStorageProvider.test.js +148 -0
  48. package/helper-apps/cortex-file-handler/tests/storage/StorageFactory.test.js +100 -0
  49. package/helper-apps/cortex-file-handler/tests/storage/StorageService.test.js +113 -0
  50. package/helper-apps/cortex-file-handler/tests/testUtils.helper.js +85 -0
  51. package/helper-apps/cortex-markitdown/.funcignore +1 -0
  52. package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/__init__.py +64 -0
  53. package/helper-apps/cortex-markitdown/MarkitdownConverterFunction/function.json +21 -0
  54. package/helper-apps/cortex-markitdown/README.md +94 -0
  55. package/helper-apps/cortex-markitdown/host.json +15 -0
  56. package/helper-apps/cortex-markitdown/requirements.txt +2 -0
  57. package/lib/entityConstants.js +1 -1
  58. package/lib/requestExecutor.js +44 -36
  59. package/package.json +1 -1
  60. package/pathways/system/entity/tools/sys_tool_readfile.js +24 -2
  61. package/server/plugins/openAiWhisperPlugin.js +59 -87
  62. package/helper-apps/cortex-file-handler/blobHandler.js +0 -567
  63. package/helper-apps/cortex-file-handler/docHelper.js +0 -144
  64. package/helper-apps/cortex-file-handler/index.js +0 -440
  65. package/helper-apps/cortex-file-handler/localFileHandler.js +0 -108
  66. package/helper-apps/cortex-file-handler/start.js +0 -63
  67. package/helper-apps/cortex-file-handler/tests/docHelper.test.js +0 -148
@@ -4,14 +4,13 @@ certifi==2025.4.26
4
4
  charset-normalizer==3.4.2
5
5
  courlan==1.3.2
6
6
  dateparser==1.2.1
7
- greenlet==3.2.1
7
+ greenlet==3.0.3
8
8
  htmldate==1.9.3
9
9
  jusText==3.0.2
10
10
  lxml==5.4.0
11
11
  lxml_html_clean==0.4.2
12
12
  MarkupSafe==3.0.2
13
- playwright==1.52.0
14
- pyee==13.0.0
13
+ pyee==11.1.0
15
14
  python-dateutil==2.9.0.post0
16
15
  pytz==2025.2
17
16
  regex==2024.11.6
@@ -20,5 +19,6 @@ tld==0.13
20
19
  trafilatura==2.0.0
21
20
  typing_extensions==4.13.2
22
21
  tzlocal==5.3.1
23
- urllib3==2.4.0
24
22
  Werkzeug==3.1.3
23
+ playwright==1.45.0
24
+ aiohttp
@@ -3,4 +3,5 @@ REDIS_CONNECTION_STRING=redis://default:redispw@localhost:32768
3
3
  AZURE_STORAGE_CONNECTION_STRING=UseDevelopmentStorage=true
4
4
  AZURE_STORAGE_CONTAINER_NAME=test-container
5
5
  NODE_ENV=test
6
- PORT=7072 # Different port for testing
6
+ PORT=7072 # Different port for testing
7
+ MARKITDOWN_CONVERT_URL= #cortex-markitdown url
@@ -6,4 +6,5 @@ GCS_BUCKETNAME=cortextempfiles
6
6
  AZURE_STORAGE_CONNECTION_STRING=UseDevelopmentStorage=true
7
7
  AZURE_STORAGE_CONTAINER_NAME=test-container
8
8
  NODE_ENV=test
9
- PORT=7072 # Different port for testing
9
+ PORT=7072 # Different port for testing
10
+ MARKITDOWN_CONVERT_URL= #cortex-markitdown url
@@ -4,4 +4,5 @@ REDIS_CONNECTION_STRING=redis://default:redispw@localhost:32768
4
4
  AZURE_STORAGE_CONTAINER_NAME=test-container
5
5
  #GCP_SERVICE_ACCOUNT_KEY={"type":"service_account","project_id":"test-project"}
6
6
  NODE_ENV=test
7
- PORT=7072 # Different port for testing
7
+ PORT=7072 # Different port for testing
8
+ MARKITDOWN_CONVERT_URL= #cortex-markitdown url
@@ -16,4 +16,4 @@ EXPOSE 7071
16
16
 
17
17
  # RUN npm run build
18
18
 
19
- CMD [ "node", "start.js" ]
19
+ CMD [ "npm", "start" ]
@@ -0,0 +1,178 @@
1
+ # Cortex File Handler Interface Documentation
2
+
3
+ ## Overview
4
+ The Cortex File Handler is a service that processes files through various operations including uploading, downloading, chunking, and document processing. It supports multiple storage backends (Azure Blob Storage, Google Cloud Storage, and Local File System).
5
+
6
+ ## Request Methods
7
+
8
+ ### POST
9
+ - **Purpose**: Upload a file
10
+ - **Content-Type**: `multipart/form-data`
11
+ - **Parameters**:
12
+ - `hash` (optional): Unique identifier for the file
13
+ - `requestId` (required): Unique identifier for the request
14
+ - File content must be included in the form data
15
+ - **Behavior**:
16
+ - Uploads file to primary storage (Azure or Local)
17
+ - If GCS is configured, also uploads to GCS
18
+ - If hash is provided, stores file metadata in Redis
19
+ - Returns upload result with file URLs
20
+ - **Response**: Object containing:
21
+ - `url`: Primary storage URL
22
+ - `gcs`: GCS URL (if GCS is configured)
23
+ - `hash`: Hash value (if provided)
24
+ - `message`: Success message
25
+ - `filename`: Original filename
26
+ - **Note**: The `save` parameter is not supported in POST requests. To convert and save a document as text, use GET with the `save` parameter.
27
+
28
+ ### GET
29
+ - **Purpose**: Process or retrieve files
30
+ - **Parameters** (can be in query string or request body):
31
+ - `uri` (required if not using fetch/load/restore): URL of the file to process
32
+ - Requires `requestId` parameter
33
+ - No Redis caching
34
+ - Direct processing based on file type
35
+ - `requestId` (required with `uri`): Unique identifier for the request
36
+ - `save` (optional): If true, saves document as text file
37
+ - When true, converts document to text and saves to primary storage only (Azure or Local)
38
+ - Does not save to GCS
39
+ - Original document is deleted from storage after text conversion
40
+ - `hash` (optional): Unique identifier for the file
41
+ - `checkHash` (optional): Check if hash exists
42
+ - `clearHash` (optional): Remove hash from storage
43
+ - `fetch`/`load`/`restore` (optional): URL of a remote file to fetch (the three parameters are aliases; any one of them triggers the same remote-file processing)
44
+ - Does not require `requestId`
45
+ - Uses Redis caching
46
+ - Downloads and validates file first
47
+ - Ensures correct file extension
48
+ - Truncates long filenames
49
+ - **Behavior**:
50
+ - For documents (PDF, DOC, etc.):
51
+ - If `save=true`:
52
+ - Converts document to text
53
+ - Saves text file to primary storage (Azure or Local)
54
+ - Deletes original document from storage
55
+ - Does not save to GCS
56
+ - Returns object with primary storage URL
57
+ - If `save=false`:
58
+ - Converts document to text
59
+ - Returns array of text chunks
60
+ - Does not persist any files
61
+ - For media files:
62
+ - Splits into chunks
63
+ - Uploads chunks to primary storage and GCS (if configured)
64
+ - Returns chunk information with offsets
65
+ - For remote files (`fetch`/`load`/`restore`):
66
+ - Downloads file from URL
67
+ - Processes based on file type
68
+ - Returns processed result
69
+ - Caches result in Redis using URL as key
70
+ - Updates Redis timestamp on subsequent requests
71
+ - Truncates filenames longer than 200 characters
72
+ - Ensures correct file extension based on content type
73
+
74
+ ### DELETE
75
+ - **Purpose**: Remove files from storage
76
+ - **Parameters** (can be in query string or request body):
77
+ - `requestId` (required): Unique identifier for the request
78
+ - **Behavior**:
79
+ - Deletes file from primary storage (Azure or Local)
80
+ - Deletes file from GCS if configured
81
+ - Returns deletion result
82
+ - **Response**: Array of deleted file URLs
83
+
84
+ ## Storage Configuration
85
+ - **Azure**: Enabled if `AZURE_STORAGE_CONNECTION_STRING` is set
86
+ - **GCS**: Enabled if `GCP_SERVICE_ACCOUNT_KEY_BASE64` or `GCP_SERVICE_ACCOUNT_KEY` is set
87
+ - **Local**: Used as fallback if Azure is not configured
88
+
89
+ ## Response Format
90
+ - **Success**:
91
+ - Status: 200
92
+ - Body: Varies by operation (see specific methods above)
93
+ - **Error**:
94
+ - Status: 400/404/500
95
+ - Body: Error message string
96
+
97
+ ## Progress Tracking
98
+ - Progress updates are published to Redis for each operation
99
+ - Progress includes:
100
+ - `progress`: Completion fraction, from 0 to 1
101
+ - `completedCount`: Number of completed steps
102
+ - `totalCount`: Total number of steps
103
+ - `numberOfChunks`: Number of chunks (for media files)
104
+ - `data`: Additional operation data
105
+ - Progress updates are published to Redis channel associated with `requestId`
106
+
107
+ ## File Types
108
+ - **Documents**: Processed based on `DOC_EXTENSIONS` list
109
+ - Supported extensions:
110
+ - Text: .txt, .json, .csv, .md, .xml, .js, .html, .css
111
+ - Office: .doc, .docx, .xls, .xlsx
112
+ - Document processing limitations:
113
+ - PDFs: Does not support scanned, encrypted, or password-protected PDFs
114
+ - Requires OCR for PDFs without embedded fonts
115
+ - Text chunking:
116
+ - Maximum chunk size: 10,000 characters
117
+ - Chunks are split at sentence boundaries when possible
118
+ - Returns array of text chunks
119
+ - **Media**: All other file types, processed through chunking
120
+ - Chunked into smaller pieces for processing
121
+ - Each chunk is stored separately
122
+ - Media chunking behavior:
123
+ - Default chunk duration: 500 seconds
124
+ - Chunks are processed in parallel (3 at a time)
125
+ - Audio is converted to MP3 format (128kbps)
126
+ - Uses 4MB read buffer for file processing
127
+ - Supported media types:
128
+ - Images: .jpg, .jpeg, .png, .webp, .heic, .heif, .pdf
129
+ - Video: .mp4, .mpeg, .mov, .avi, .flv, .mpg, .webm, .wmv, .3gp
130
+ - Audio: .wav, .mp3, .aac, .ogg, .flac, .m4a
131
+ - File download behavior:
132
+ - 30 second timeout for downloads
133
+ - Supports streaming downloads
134
+ - Handles URL encoding/decoding
135
+ - Truncates filenames longer than 200 characters
136
+
137
+ ## Storage Behavior
138
+ - **Primary Storage** (Azure or Local):
139
+ - Files are stored with UUID-based names
140
+ - Organized by requestId folders
141
+ - Azure: Uses SAS tokens for access
142
+ - Local: Served via HTTP on configured port
143
+ - **GCS** (if configured):
144
+ - Files stored with gs:// protocol URLs
145
+ - Same folder structure as primary storage
146
+ - Only used for media file chunks
147
+ - **Redis**:
148
+ - Stores file metadata and URLs
149
+ - Used for caching remote file results
150
+ - Tracks file access timestamps
151
+ - Used for progress tracking
152
+
153
+ ## Cleanup
154
+ - Automatic cleanup of inactive files
155
+ - Removes files from:
156
+ - Primary storage (Azure/Local)
157
+ - GCS (if configured)
158
+ - Redis file store map
159
+ - Cleanup is triggered on each request, but is skipped if a cleanup run is already in progress
160
+ - Temporary files are cleaned up:
161
+ - After 1 hour of inactivity
162
+ - After successful processing
163
+ - On error conditions
164
+
165
+ ## Error Handling
166
+ - **400 Bad Request**:
167
+ - Missing required parameters
168
+ - Invalid or inaccessible URL
169
+ - Unsupported file type
170
+ - **404 Not Found**:
171
+ - File or hash not found
172
+ - File not found in storage
173
+ - **500 Internal Server Error**:
174
+ - Processing errors
175
+ - Storage errors
176
+ - Document conversion errors
177
+ - PDF processing errors (scanned, encrypted, password-protected)
178
+ - All errors include descriptive message in response body
@@ -5,11 +5,7 @@
5
5
  "type": "httpTrigger",
6
6
  "direction": "in",
7
7
  "name": "req",
8
- "methods": [
9
- "get",
10
- "post",
11
- "delete"
12
- ]
8
+ "methods": ["get", "post", "delete"]
13
9
  },
14
10
  {
15
11
  "type": "http",
@@ -17,4 +13,4 @@
17
13
  "name": "res"
18
14
  }
19
15
  ]
20
- }
16
+ }