llama-cloud 0.1.35__py3-none-any.whl → 0.1.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of llama-cloud might be problematic. Click here for more details.
- llama_cloud/__init__.py +20 -2
- llama_cloud/resources/admin/client.py +51 -0
- llama_cloud/resources/classifier/client.py +231 -181
- llama_cloud/resources/data_sinks/types/data_sink_update_component.py +2 -0
- llama_cloud/resources/llama_extract/client.py +96 -4
- llama_cloud/types/__init__.py +20 -2
- llama_cloud/types/classification_result.py +4 -5
- llama_cloud/types/classifier_rule.py +43 -0
- llama_cloud/types/classify_job.py +45 -0
- llama_cloud/types/{classify_response.py → classify_job_results.py} +3 -6
- llama_cloud/types/classify_job_with_status.py +47 -0
- llama_cloud/types/classify_parsing_configuration.py +38 -0
- llama_cloud/types/cloud_astra_db_vector_store.py +51 -0
- llama_cloud/types/cloud_confluence_data_source.py +15 -0
- llama_cloud/types/configurable_data_sink_names.py +4 -0
- llama_cloud/types/data_sink_component.py +2 -0
- llama_cloud/types/data_sink_create_component.py +2 -0
- llama_cloud/types/failure_handling_config.py +37 -0
- llama_cloud/types/file_classification.py +41 -0
- llama_cloud/types/file_store_info_response.py +34 -0
- llama_cloud/types/file_store_info_response_status.py +25 -0
- llama_cloud/types/supported_llm_model_names.py +12 -0
- {llama_cloud-0.1.35.dist-info → llama_cloud-0.1.36.dist-info}/METADATA +2 -4
- {llama_cloud-0.1.35.dist-info → llama_cloud-0.1.36.dist-info}/RECORD +26 -17
- {llama_cloud-0.1.35.dist-info → llama_cloud-0.1.36.dist-info}/WHEEL +1 -1
- {llama_cloud-0.1.35.dist-info → llama_cloud-0.1.36.dist-info}/LICENSE +0 -0
llama_cloud/__init__.py
CHANGED
|
@@ -38,7 +38,12 @@ from .types import (
|
|
|
38
38
|
ChatData,
|
|
39
39
|
ChunkMode,
|
|
40
40
|
ClassificationResult,
|
|
41
|
-
|
|
41
|
+
ClassifierRule,
|
|
42
|
+
ClassifyJob,
|
|
43
|
+
ClassifyJobResults,
|
|
44
|
+
ClassifyJobWithStatus,
|
|
45
|
+
ClassifyParsingConfiguration,
|
|
46
|
+
CloudAstraDbVectorStore,
|
|
42
47
|
CloudAzStorageBlobDataSource,
|
|
43
48
|
CloudAzureAiSearchVectorStore,
|
|
44
49
|
CloudBoxDataSource,
|
|
@@ -132,13 +137,17 @@ from .types import (
|
|
|
132
137
|
ExtractState,
|
|
133
138
|
ExtractTarget,
|
|
134
139
|
FailPageMode,
|
|
140
|
+
FailureHandlingConfig,
|
|
135
141
|
File,
|
|
142
|
+
FileClassification,
|
|
136
143
|
FileCountByStatusResponse,
|
|
137
144
|
FileData,
|
|
138
145
|
FileIdPresignedUrl,
|
|
139
146
|
FileParsePublic,
|
|
140
147
|
FilePermissionInfoValue,
|
|
141
148
|
FileResourceInfoValue,
|
|
149
|
+
FileStoreInfoResponse,
|
|
150
|
+
FileStoreInfoResponseStatus,
|
|
142
151
|
FilterCondition,
|
|
143
152
|
FilterOperation,
|
|
144
153
|
FilterOperationEq,
|
|
@@ -457,7 +466,12 @@ __all__ = [
|
|
|
457
466
|
"ChatData",
|
|
458
467
|
"ChunkMode",
|
|
459
468
|
"ClassificationResult",
|
|
460
|
-
"ClassifyResponse",
|
|
469
|
+
"ClassifierRule",
|
|
470
|
+
"ClassifyJob",
|
|
471
|
+
"ClassifyJobResults",
|
|
472
|
+
"ClassifyJobWithStatus",
|
|
473
|
+
"ClassifyParsingConfiguration",
|
|
474
|
+
"CloudAstraDbVectorStore",
|
|
461
475
|
"CloudAzStorageBlobDataSource",
|
|
462
476
|
"CloudAzureAiSearchVectorStore",
|
|
463
477
|
"CloudBoxDataSource",
|
|
@@ -572,7 +586,9 @@ __all__ = [
|
|
|
572
586
|
"ExtractStatelessRequestDataSchemaZeroValue",
|
|
573
587
|
"ExtractTarget",
|
|
574
588
|
"FailPageMode",
|
|
589
|
+
"FailureHandlingConfig",
|
|
575
590
|
"File",
|
|
591
|
+
"FileClassification",
|
|
576
592
|
"FileCountByStatusResponse",
|
|
577
593
|
"FileCreateFromUrlResourceInfoValue",
|
|
578
594
|
"FileCreatePermissionInfoValue",
|
|
@@ -582,6 +598,8 @@ __all__ = [
|
|
|
582
598
|
"FileParsePublic",
|
|
583
599
|
"FilePermissionInfoValue",
|
|
584
600
|
"FileResourceInfoValue",
|
|
601
|
+
"FileStoreInfoResponse",
|
|
602
|
+
"FileStoreInfoResponseStatus",
|
|
585
603
|
"FilterCondition",
|
|
586
604
|
"FilterOperation",
|
|
587
605
|
"FilterOperationEq",
|
|
llama_cloud/resources/admin/client.py
CHANGED
|
@@ -8,6 +8,7 @@ from ...core.api_error import ApiError
|
|
|
8
8
|
from ...core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
|
9
9
|
from ...core.remove_none_from_dict import remove_none_from_dict
|
|
10
10
|
from ...errors.unprocessable_entity_error import UnprocessableEntityError
|
|
11
|
+
from ...types.file_store_info_response import FileStoreInfoResponse
|
|
11
12
|
from ...types.http_validation_error import HttpValidationError
|
|
12
13
|
from ...types.license_info_response import LicenseInfoResponse
|
|
13
14
|
|
|
@@ -53,6 +54,31 @@ class AdminClient:
|
|
|
53
54
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
|
54
55
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
|
55
56
|
|
|
57
|
+
def get_file_store_info(self) -> FileStoreInfoResponse:
|
|
58
|
+
"""
|
|
59
|
+
from llama_cloud.client import LlamaCloud
|
|
60
|
+
|
|
61
|
+
client = LlamaCloud(
|
|
62
|
+
token="YOUR_TOKEN",
|
|
63
|
+
)
|
|
64
|
+
client.admin.get_file_store_info()
|
|
65
|
+
"""
|
|
66
|
+
_response = self._client_wrapper.httpx_client.request(
|
|
67
|
+
"GET",
|
|
68
|
+
urllib.parse.urljoin(f"{self._client_wrapper.get_base_url()}/", "api/v1/admin/filestores/info"),
|
|
69
|
+
headers=self._client_wrapper.get_headers(),
|
|
70
|
+
timeout=60,
|
|
71
|
+
)
|
|
72
|
+
if 200 <= _response.status_code < 300:
|
|
73
|
+
return pydantic.parse_obj_as(FileStoreInfoResponse, _response.json()) # type: ignore
|
|
74
|
+
if _response.status_code == 422:
|
|
75
|
+
raise UnprocessableEntityError(pydantic.parse_obj_as(HttpValidationError, _response.json())) # type: ignore
|
|
76
|
+
try:
|
|
77
|
+
_response_json = _response.json()
|
|
78
|
+
except JSONDecodeError:
|
|
79
|
+
raise ApiError(status_code=_response.status_code, body=_response.text)
|
|
80
|
+
raise ApiError(status_code=_response.status_code, body=_response_json)
|
|
81
|
+
|
|
56
82
|
|
|
57
83
|
class AsyncAdminClient:
|
|
58
84
|
def __init__(self, *, client_wrapper: AsyncClientWrapper):
|
|
@@ -86,3 +112,28 @@ class AsyncAdminClient:
|
|
|
86
112
|
except JSONDecodeError:
|
|
87
113
|
raise ApiError(status_code=_response.status_code, body=_response.text)
|
|
88
114
|
raise ApiError(status_code=_response.status_code, body=_response_json)
|
|
115
|
+
|
|
116
|
+
async def get_file_store_info(self) -> FileStoreInfoResponse:
|
|
117
|
+
"""
|
|
118
|
+
from llama_cloud.client import AsyncLlamaCloud
|
|
119
|
+
|
|
120
|
+
client = AsyncLlamaCloud(
|
|
121
|
+
token="YOUR_TOKEN",
|
|
122
|
+
)
|
|
123
|
+
await client.admin.get_file_store_info()
|
|
124
|
+
"""
|
|
125
|
+
_response = await self._client_wrapper.httpx_client.request(
|
|
126
|
+
"GET",
|
|
127
|
+
urllib.parse.urljoin(f"{self._client_wrapper.get_base_url()}/", "api/v1/admin/filestores/info"),
|
|
128
|
+
headers=self._client_wrapper.get_headers(),
|
|
129
|
+
timeout=60,
|
|
130
|
+
)
|
|
131
|
+
if 200 <= _response.status_code < 300:
|
|
132
|
+
return pydantic.parse_obj_as(FileStoreInfoResponse, _response.json()) # type: ignore
|
|
133
|
+
if _response.status_code == 422:
|
|
134
|
+
raise UnprocessableEntityError(pydantic.parse_obj_as(HttpValidationError, _response.json())) # type: ignore
|
|
135
|
+
try:
|
|
136
|
+
_response_json = _response.json()
|
|
137
|
+
except JSONDecodeError:
|
|
138
|
+
raise ApiError(status_code=_response.status_code, body=_response.text)
|
|
139
|
+
raise ApiError(status_code=_response.status_code, body=_response_json)
|
|
llama_cloud/resources/classifier/client.py
CHANGED
|
@@ -9,7 +9,11 @@ from ...core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
|
|
|
9
9
|
from ...core.jsonable_encoder import jsonable_encoder
|
|
10
10
|
from ...core.remove_none_from_dict import remove_none_from_dict
|
|
11
11
|
from ...errors.unprocessable_entity_error import UnprocessableEntityError
|
|
12
|
-
from ...types.classify_response import ClassifyResponse
|
|
12
|
+
from ...types.classifier_rule import ClassifierRule
|
|
13
|
+
from ...types.classify_job import ClassifyJob
|
|
14
|
+
from ...types.classify_job_results import ClassifyJobResults
|
|
15
|
+
from ...types.classify_job_with_status import ClassifyJobWithStatus
|
|
16
|
+
from ...types.classify_parsing_configuration import ClassifyParsingConfiguration
|
|
13
17
|
from ...types.http_validation_error import HttpValidationError
|
|
14
18
|
|
|
15
19
|
try:
|
|
@@ -28,126 +32,149 @@ class ClassifierClient:
|
|
|
28
32
|
def __init__(self, *, client_wrapper: SyncClientWrapper):
|
|
29
33
|
self._client_wrapper = client_wrapper
|
|
30
34
|
|
|
31
|
-
def
|
|
35
|
+
def create_classify_job(
|
|
32
36
|
self,
|
|
33
37
|
*,
|
|
34
38
|
project_id: typing.Optional[str] = None,
|
|
35
39
|
organization_id: typing.Optional[str] = None,
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
enable_metadata_heuristic: typing.Optional[bool] = OMIT,
|
|
41
|
-
) -> ClassifyResponse:
|
|
40
|
+
rules: typing.List[ClassifierRule],
|
|
41
|
+
file_ids: typing.List[str],
|
|
42
|
+
parsing_configuration: typing.Optional[ClassifyParsingConfiguration] = OMIT,
|
|
43
|
+
) -> ClassifyJob:
|
|
42
44
|
"""
|
|
43
|
-
|
|
45
|
+
Create a classify job.
|
|
46
|
+
Experimental: This endpoint is not yet ready for production use and is subject to change at any time.
|
|
44
47
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
This endpoint supports:
|
|
48
|
-
|
|
49
|
-
- Classifying new uploaded files
|
|
50
|
-
- Classifying existing files by ID
|
|
51
|
-
- Both new files and existing file IDs in one request
|
|
52
|
-
|
|
53
|
-
## v0 Features:
|
|
54
|
-
|
|
55
|
-
- **Simplified Rules**: Only `type` and `description` fields needed
|
|
56
|
-
- **Matching Threshold**: Confidence-based classification with configurable threshold
|
|
57
|
-
- **Smart Classification**: Filename heuristics + LLM content analysis
|
|
58
|
-
- **Document Type Filtering**: Automatically filters out non-document file types
|
|
59
|
-
- **Fast Processing**: Uses LlamaParse fast mode + GPT-4.1-nano
|
|
60
|
-
- **Optimized Performance**: Parses each file only once for all rules
|
|
61
|
-
|
|
62
|
-
## Simplified Scoring Logic:
|
|
63
|
-
|
|
64
|
-
1. **Evaluate All Rules**: Compare document against all classification rules
|
|
65
|
-
2. **Best Match Selection**: Return the highest scoring rule above matching_threshold
|
|
66
|
-
3. **Unknown Classification**: Return as "unknown" if no rules score above threshold
|
|
67
|
-
|
|
68
|
-
This ensures optimal classification by:
|
|
69
|
-
|
|
70
|
-
- Finding the best possible match among all rules
|
|
71
|
-
- Avoiding false positives with confidence thresholds
|
|
72
|
-
- Maximizing performance with single-pass file parsing
|
|
48
|
+
Parameters:
|
|
49
|
+
- project_id: typing.Optional[str].
|
|
73
50
|
|
|
74
|
-
|
|
51
|
+
- organization_id: typing.Optional[str].
|
|
75
52
|
|
|
76
|
-
|
|
77
|
-
[
|
|
78
|
-
{
|
|
79
|
-
"type": "invoice",
|
|
80
|
-
"description": "contains invoice number, line items, and total amount"
|
|
81
|
-
},
|
|
82
|
-
{
|
|
83
|
-
"type": "receipt",
|
|
84
|
-
"description": "purchase receipt with transaction details and payment info"
|
|
85
|
-
}
|
|
86
|
-
]
|
|
87
|
-
```
|
|
53
|
+
- rules: typing.List[ClassifierRule]. The rules to classify the files
|
|
88
54
|
|
|
89
|
-
|
|
55
|
+
- file_ids: typing.List[str]. The IDs of the files to classify
|
|
90
56
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
2. **Result**: Returns type, confidence score, and matched rule information
|
|
57
|
+
- parsing_configuration: typing.Optional[ClassifyParsingConfiguration]. The configuration for the parsing job
|
|
58
|
+
---
|
|
59
|
+
from llama_cloud import ClassifyParsingConfiguration, ParserLanguages
|
|
60
|
+
from llama_cloud.client import LlamaCloud
|
|
96
61
|
|
|
97
|
-
|
|
62
|
+
client = LlamaCloud(
|
|
63
|
+
token="YOUR_TOKEN",
|
|
64
|
+
)
|
|
65
|
+
client.classifier.create_classify_job(
|
|
66
|
+
rules=[],
|
|
67
|
+
file_ids=[],
|
|
68
|
+
parsing_configuration=ClassifyParsingConfiguration(
|
|
69
|
+
lang=ParserLanguages.AF,
|
|
70
|
+
),
|
|
71
|
+
)
|
|
72
|
+
"""
|
|
73
|
+
_request: typing.Dict[str, typing.Any] = {"rules": rules, "file_ids": file_ids}
|
|
74
|
+
if parsing_configuration is not OMIT:
|
|
75
|
+
_request["parsing_configuration"] = parsing_configuration
|
|
76
|
+
_response = self._client_wrapper.httpx_client.request(
|
|
77
|
+
"POST",
|
|
78
|
+
urllib.parse.urljoin(f"{self._client_wrapper.get_base_url()}/", "api/v1/classifier/jobs"),
|
|
79
|
+
params=remove_none_from_dict({"project_id": project_id, "organization_id": organization_id}),
|
|
80
|
+
json=jsonable_encoder(_request),
|
|
81
|
+
headers=self._client_wrapper.get_headers(),
|
|
82
|
+
timeout=60,
|
|
83
|
+
)
|
|
84
|
+
if 200 <= _response.status_code < 300:
|
|
85
|
+
return pydantic.parse_obj_as(ClassifyJob, _response.json()) # type: ignore
|
|
86
|
+
if _response.status_code == 422:
|
|
87
|
+
raise UnprocessableEntityError(pydantic.parse_obj_as(HttpValidationError, _response.json())) # type: ignore
|
|
88
|
+
try:
|
|
89
|
+
_response_json = _response.json()
|
|
90
|
+
except JSONDecodeError:
|
|
91
|
+
raise ApiError(status_code=_response.status_code, body=_response.text)
|
|
92
|
+
raise ApiError(status_code=_response.status_code, body=_response_json)
|
|
98
93
|
|
|
99
|
-
|
|
100
|
-
|
|
94
|
+
def get_classify_job(
|
|
95
|
+
self,
|
|
96
|
+
classify_job_id: str,
|
|
97
|
+
*,
|
|
98
|
+
project_id: typing.Optional[str] = None,
|
|
99
|
+
organization_id: typing.Optional[str] = None,
|
|
100
|
+
) -> ClassifyJobWithStatus:
|
|
101
|
+
"""
|
|
102
|
+
Get a classify job.
|
|
103
|
+
Experimental: This endpoint is not yet ready for production use and is subject to change at any time.
|
|
101
104
|
|
|
102
|
-
|
|
105
|
+
Parameters:
|
|
106
|
+
- classify_job_id: str.
|
|
103
107
|
|
|
104
|
-
|
|
105
|
-
**Web Documents**: html, htm, xml
|
|
106
|
-
**Markup**: md, markdown
|
|
108
|
+
- project_id: typing.Optional[str].
|
|
107
109
|
|
|
108
|
-
|
|
110
|
+
- organization_id: typing.Optional[str].
|
|
111
|
+
---
|
|
112
|
+
from llama_cloud.client import LlamaCloud
|
|
109
113
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
+
client = LlamaCloud(
|
|
115
|
+
token="YOUR_TOKEN",
|
|
116
|
+
)
|
|
117
|
+
client.classifier.get_classify_job(
|
|
118
|
+
classify_job_id="string",
|
|
119
|
+
)
|
|
120
|
+
"""
|
|
121
|
+
_response = self._client_wrapper.httpx_client.request(
|
|
122
|
+
"GET",
|
|
123
|
+
urllib.parse.urljoin(
|
|
124
|
+
f"{self._client_wrapper.get_base_url()}/", f"api/v1/classifier/jobs/{classify_job_id}"
|
|
125
|
+
),
|
|
126
|
+
params=remove_none_from_dict({"project_id": project_id, "organization_id": organization_id}),
|
|
127
|
+
headers=self._client_wrapper.get_headers(),
|
|
128
|
+
timeout=60,
|
|
129
|
+
)
|
|
130
|
+
if 200 <= _response.status_code < 300:
|
|
131
|
+
return pydantic.parse_obj_as(ClassifyJobWithStatus, _response.json()) # type: ignore
|
|
132
|
+
if _response.status_code == 422:
|
|
133
|
+
raise UnprocessableEntityError(pydantic.parse_obj_as(HttpValidationError, _response.json())) # type: ignore
|
|
134
|
+
try:
|
|
135
|
+
_response_json = _response.json()
|
|
136
|
+
except JSONDecodeError:
|
|
137
|
+
raise ApiError(status_code=_response.status_code, body=_response.text)
|
|
138
|
+
raise ApiError(status_code=_response.status_code, body=_response_json)
|
|
114
139
|
|
|
115
|
-
|
|
140
|
+
def get_classification_job_results(
|
|
141
|
+
self,
|
|
142
|
+
classify_job_id: str,
|
|
143
|
+
*,
|
|
144
|
+
project_id: typing.Optional[str] = None,
|
|
145
|
+
organization_id: typing.Optional[str] = None,
|
|
146
|
+
) -> ClassifyJobResults:
|
|
147
|
+
"""
|
|
148
|
+
Get the results of a classify job.
|
|
149
|
+
Experimental: This endpoint is not yet ready for production use and is subject to change at any time.
|
|
116
150
|
|
|
117
151
|
Parameters:
|
|
152
|
+
- classify_job_id: str.
|
|
153
|
+
|
|
118
154
|
- project_id: typing.Optional[str].
|
|
119
155
|
|
|
120
156
|
- organization_id: typing.Optional[str].
|
|
157
|
+
---
|
|
158
|
+
from llama_cloud.client import LlamaCloud
|
|
121
159
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
- matching_threshold: typing.Optional[float].
|
|
129
|
-
|
|
130
|
-
- enable_metadata_heuristic: typing.Optional[bool].
|
|
160
|
+
client = LlamaCloud(
|
|
161
|
+
token="YOUR_TOKEN",
|
|
162
|
+
)
|
|
163
|
+
client.classifier.get_classification_job_results(
|
|
164
|
+
classify_job_id="string",
|
|
165
|
+
)
|
|
131
166
|
"""
|
|
132
|
-
_request: typing.Dict[str, typing.Any] = {"rules_json": rules_json}
|
|
133
|
-
if files is not OMIT:
|
|
134
|
-
_request["files"] = files
|
|
135
|
-
if file_ids is not OMIT:
|
|
136
|
-
_request["file_ids"] = file_ids
|
|
137
|
-
if matching_threshold is not OMIT:
|
|
138
|
-
_request["matching_threshold"] = matching_threshold
|
|
139
|
-
if enable_metadata_heuristic is not OMIT:
|
|
140
|
-
_request["enable_metadata_heuristic"] = enable_metadata_heuristic
|
|
141
167
|
_response = self._client_wrapper.httpx_client.request(
|
|
142
|
-
"
|
|
143
|
-
urllib.parse.urljoin(
|
|
168
|
+
"GET",
|
|
169
|
+
urllib.parse.urljoin(
|
|
170
|
+
f"{self._client_wrapper.get_base_url()}/", f"api/v1/classifier/jobs/{classify_job_id}/results"
|
|
171
|
+
),
|
|
144
172
|
params=remove_none_from_dict({"project_id": project_id, "organization_id": organization_id}),
|
|
145
|
-
json=jsonable_encoder(_request),
|
|
146
173
|
headers=self._client_wrapper.get_headers(),
|
|
147
174
|
timeout=60,
|
|
148
175
|
)
|
|
149
176
|
if 200 <= _response.status_code < 300:
|
|
150
|
-
return pydantic.parse_obj_as(ClassifyResponse, _response.json()) # type: ignore
|
|
177
|
+
return pydantic.parse_obj_as(ClassifyJobResults, _response.json()) # type: ignore
|
|
151
178
|
if _response.status_code == 422:
|
|
152
179
|
raise UnprocessableEntityError(pydantic.parse_obj_as(HttpValidationError, _response.json())) # type: ignore
|
|
153
180
|
try:
|
|
@@ -161,126 +188,149 @@ class AsyncClassifierClient:
|
|
|
161
188
|
def __init__(self, *, client_wrapper: AsyncClientWrapper):
|
|
162
189
|
self._client_wrapper = client_wrapper
|
|
163
190
|
|
|
164
|
-
async def
|
|
191
|
+
async def create_classify_job(
|
|
165
192
|
self,
|
|
166
193
|
*,
|
|
167
194
|
project_id: typing.Optional[str] = None,
|
|
168
195
|
organization_id: typing.Optional[str] = None,
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
enable_metadata_heuristic: typing.Optional[bool] = OMIT,
|
|
174
|
-
) -> ClassifyResponse:
|
|
196
|
+
rules: typing.List[ClassifierRule],
|
|
197
|
+
file_ids: typing.List[str],
|
|
198
|
+
parsing_configuration: typing.Optional[ClassifyParsingConfiguration] = OMIT,
|
|
199
|
+
) -> ClassifyJob:
|
|
175
200
|
"""
|
|
176
|
-
|
|
201
|
+
Create a classify job.
|
|
202
|
+
Experimental: This endpoint is not yet ready for production use and is subject to change at any time.
|
|
177
203
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
This endpoint supports:
|
|
181
|
-
|
|
182
|
-
- Classifying new uploaded files
|
|
183
|
-
- Classifying existing files by ID
|
|
184
|
-
- Both new files and existing file IDs in one request
|
|
185
|
-
|
|
186
|
-
## v0 Features:
|
|
187
|
-
|
|
188
|
-
- **Simplified Rules**: Only `type` and `description` fields needed
|
|
189
|
-
- **Matching Threshold**: Confidence-based classification with configurable threshold
|
|
190
|
-
- **Smart Classification**: Filename heuristics + LLM content analysis
|
|
191
|
-
- **Document Type Filtering**: Automatically filters out non-document file types
|
|
192
|
-
- **Fast Processing**: Uses LlamaParse fast mode + GPT-4.1-nano
|
|
193
|
-
- **Optimized Performance**: Parses each file only once for all rules
|
|
194
|
-
|
|
195
|
-
## Simplified Scoring Logic:
|
|
196
|
-
|
|
197
|
-
1. **Evaluate All Rules**: Compare document against all classification rules
|
|
198
|
-
2. **Best Match Selection**: Return the highest scoring rule above matching_threshold
|
|
199
|
-
3. **Unknown Classification**: Return as "unknown" if no rules score above threshold
|
|
200
|
-
|
|
201
|
-
This ensures optimal classification by:
|
|
202
|
-
|
|
203
|
-
- Finding the best possible match among all rules
|
|
204
|
-
- Avoiding false positives with confidence thresholds
|
|
205
|
-
- Maximizing performance with single-pass file parsing
|
|
204
|
+
Parameters:
|
|
205
|
+
- project_id: typing.Optional[str].
|
|
206
206
|
|
|
207
|
-
|
|
207
|
+
- organization_id: typing.Optional[str].
|
|
208
208
|
|
|
209
|
-
|
|
210
|
-
[
|
|
211
|
-
{
|
|
212
|
-
"type": "invoice",
|
|
213
|
-
"description": "contains invoice number, line items, and total amount"
|
|
214
|
-
},
|
|
215
|
-
{
|
|
216
|
-
"type": "receipt",
|
|
217
|
-
"description": "purchase receipt with transaction details and payment info"
|
|
218
|
-
}
|
|
219
|
-
]
|
|
220
|
-
```
|
|
209
|
+
- rules: typing.List[ClassifierRule]. The rules to classify the files
|
|
221
210
|
|
|
222
|
-
|
|
211
|
+
- file_ids: typing.List[str]. The IDs of the files to classify
|
|
223
212
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
2. **Result**: Returns type, confidence score, and matched rule information
|
|
213
|
+
- parsing_configuration: typing.Optional[ClassifyParsingConfiguration]. The configuration for the parsing job
|
|
214
|
+
---
|
|
215
|
+
from llama_cloud import ClassifyParsingConfiguration, ParserLanguages
|
|
216
|
+
from llama_cloud.client import AsyncLlamaCloud
|
|
229
217
|
|
|
230
|
-
|
|
218
|
+
client = AsyncLlamaCloud(
|
|
219
|
+
token="YOUR_TOKEN",
|
|
220
|
+
)
|
|
221
|
+
await client.classifier.create_classify_job(
|
|
222
|
+
rules=[],
|
|
223
|
+
file_ids=[],
|
|
224
|
+
parsing_configuration=ClassifyParsingConfiguration(
|
|
225
|
+
lang=ParserLanguages.AF,
|
|
226
|
+
),
|
|
227
|
+
)
|
|
228
|
+
"""
|
|
229
|
+
_request: typing.Dict[str, typing.Any] = {"rules": rules, "file_ids": file_ids}
|
|
230
|
+
if parsing_configuration is not OMIT:
|
|
231
|
+
_request["parsing_configuration"] = parsing_configuration
|
|
232
|
+
_response = await self._client_wrapper.httpx_client.request(
|
|
233
|
+
"POST",
|
|
234
|
+
urllib.parse.urljoin(f"{self._client_wrapper.get_base_url()}/", "api/v1/classifier/jobs"),
|
|
235
|
+
params=remove_none_from_dict({"project_id": project_id, "organization_id": organization_id}),
|
|
236
|
+
json=jsonable_encoder(_request),
|
|
237
|
+
headers=self._client_wrapper.get_headers(),
|
|
238
|
+
timeout=60,
|
|
239
|
+
)
|
|
240
|
+
if 200 <= _response.status_code < 300:
|
|
241
|
+
return pydantic.parse_obj_as(ClassifyJob, _response.json()) # type: ignore
|
|
242
|
+
if _response.status_code == 422:
|
|
243
|
+
raise UnprocessableEntityError(pydantic.parse_obj_as(HttpValidationError, _response.json())) # type: ignore
|
|
244
|
+
try:
|
|
245
|
+
_response_json = _response.json()
|
|
246
|
+
except JSONDecodeError:
|
|
247
|
+
raise ApiError(status_code=_response.status_code, body=_response.text)
|
|
248
|
+
raise ApiError(status_code=_response.status_code, body=_response_json)
|
|
231
249
|
|
|
232
|
-
|
|
233
|
-
|
|
250
|
+
async def get_classify_job(
|
|
251
|
+
self,
|
|
252
|
+
classify_job_id: str,
|
|
253
|
+
*,
|
|
254
|
+
project_id: typing.Optional[str] = None,
|
|
255
|
+
organization_id: typing.Optional[str] = None,
|
|
256
|
+
) -> ClassifyJobWithStatus:
|
|
257
|
+
"""
|
|
258
|
+
Get a classify job.
|
|
259
|
+
Experimental: This endpoint is not yet ready for production use and is subject to change at any time.
|
|
234
260
|
|
|
235
|
-
|
|
261
|
+
Parameters:
|
|
262
|
+
- classify_job_id: str.
|
|
236
263
|
|
|
237
|
-
|
|
238
|
-
**Web Documents**: html, htm, xml
|
|
239
|
-
**Markup**: md, markdown
|
|
264
|
+
- project_id: typing.Optional[str].
|
|
240
265
|
|
|
241
|
-
|
|
266
|
+
- organization_id: typing.Optional[str].
|
|
267
|
+
---
|
|
268
|
+
from llama_cloud.client import AsyncLlamaCloud
|
|
242
269
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
270
|
+
client = AsyncLlamaCloud(
|
|
271
|
+
token="YOUR_TOKEN",
|
|
272
|
+
)
|
|
273
|
+
await client.classifier.get_classify_job(
|
|
274
|
+
classify_job_id="string",
|
|
275
|
+
)
|
|
276
|
+
"""
|
|
277
|
+
_response = await self._client_wrapper.httpx_client.request(
|
|
278
|
+
"GET",
|
|
279
|
+
urllib.parse.urljoin(
|
|
280
|
+
f"{self._client_wrapper.get_base_url()}/", f"api/v1/classifier/jobs/{classify_job_id}"
|
|
281
|
+
),
|
|
282
|
+
params=remove_none_from_dict({"project_id": project_id, "organization_id": organization_id}),
|
|
283
|
+
headers=self._client_wrapper.get_headers(),
|
|
284
|
+
timeout=60,
|
|
285
|
+
)
|
|
286
|
+
if 200 <= _response.status_code < 300:
|
|
287
|
+
return pydantic.parse_obj_as(ClassifyJobWithStatus, _response.json()) # type: ignore
|
|
288
|
+
if _response.status_code == 422:
|
|
289
|
+
raise UnprocessableEntityError(pydantic.parse_obj_as(HttpValidationError, _response.json())) # type: ignore
|
|
290
|
+
try:
|
|
291
|
+
_response_json = _response.json()
|
|
292
|
+
except JSONDecodeError:
|
|
293
|
+
raise ApiError(status_code=_response.status_code, body=_response.text)
|
|
294
|
+
raise ApiError(status_code=_response.status_code, body=_response_json)
|
|
247
295
|
|
|
248
|
-
|
|
296
|
+
async def get_classification_job_results(
|
|
297
|
+
self,
|
|
298
|
+
classify_job_id: str,
|
|
299
|
+
*,
|
|
300
|
+
project_id: typing.Optional[str] = None,
|
|
301
|
+
organization_id: typing.Optional[str] = None,
|
|
302
|
+
) -> ClassifyJobResults:
|
|
303
|
+
"""
|
|
304
|
+
Get the results of a classify job.
|
|
305
|
+
Experimental: This endpoint is not yet ready for production use and is subject to change at any time.
|
|
249
306
|
|
|
250
307
|
Parameters:
|
|
308
|
+
- classify_job_id: str.
|
|
309
|
+
|
|
251
310
|
- project_id: typing.Optional[str].
|
|
252
311
|
|
|
253
312
|
- organization_id: typing.Optional[str].
|
|
313
|
+
---
|
|
314
|
+
from llama_cloud.client import AsyncLlamaCloud
|
|
254
315
|
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
- matching_threshold: typing.Optional[float].
|
|
262
|
-
|
|
263
|
-
- enable_metadata_heuristic: typing.Optional[bool].
|
|
316
|
+
client = AsyncLlamaCloud(
|
|
317
|
+
token="YOUR_TOKEN",
|
|
318
|
+
)
|
|
319
|
+
await client.classifier.get_classification_job_results(
|
|
320
|
+
classify_job_id="string",
|
|
321
|
+
)
|
|
264
322
|
"""
|
|
265
|
-
_request: typing.Dict[str, typing.Any] = {"rules_json": rules_json}
|
|
266
|
-
if files is not OMIT:
|
|
267
|
-
_request["files"] = files
|
|
268
|
-
if file_ids is not OMIT:
|
|
269
|
-
_request["file_ids"] = file_ids
|
|
270
|
-
if matching_threshold is not OMIT:
|
|
271
|
-
_request["matching_threshold"] = matching_threshold
|
|
272
|
-
if enable_metadata_heuristic is not OMIT:
|
|
273
|
-
_request["enable_metadata_heuristic"] = enable_metadata_heuristic
|
|
274
323
|
_response = await self._client_wrapper.httpx_client.request(
|
|
275
|
-
"
|
|
276
|
-
urllib.parse.urljoin(
|
|
324
|
+
"GET",
|
|
325
|
+
urllib.parse.urljoin(
|
|
326
|
+
f"{self._client_wrapper.get_base_url()}/", f"api/v1/classifier/jobs/{classify_job_id}/results"
|
|
327
|
+
),
|
|
277
328
|
params=remove_none_from_dict({"project_id": project_id, "organization_id": organization_id}),
|
|
278
|
-
json=jsonable_encoder(_request),
|
|
279
329
|
headers=self._client_wrapper.get_headers(),
|
|
280
330
|
timeout=60,
|
|
281
331
|
)
|
|
282
332
|
if 200 <= _response.status_code < 300:
|
|
283
|
-
return pydantic.parse_obj_as(ClassifyResponse, _response.json()) # type: ignore
|
|
333
|
+
return pydantic.parse_obj_as(ClassifyJobResults, _response.json()) # type: ignore
|
|
284
334
|
if _response.status_code == 422:
|
|
285
335
|
raise UnprocessableEntityError(pydantic.parse_obj_as(HttpValidationError, _response.json())) # type: ignore
|
|
286
336
|
try:
|
|
llama_cloud/resources/data_sinks/types/data_sink_update_component.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import typing
|
|
4
4
|
|
|
5
|
+
from ....types.cloud_astra_db_vector_store import CloudAstraDbVectorStore
|
|
5
6
|
from ....types.cloud_azure_ai_search_vector_store import CloudAzureAiSearchVectorStore
|
|
6
7
|
from ....types.cloud_milvus_vector_store import CloudMilvusVectorStore
|
|
7
8
|
from ....types.cloud_mongo_db_atlas_vector_search import CloudMongoDbAtlasVectorSearch
|
|
@@ -17,4 +18,5 @@ DataSinkUpdateComponent = typing.Union[
|
|
|
17
18
|
CloudAzureAiSearchVectorStore,
|
|
18
19
|
CloudMongoDbAtlasVectorSearch,
|
|
19
20
|
CloudMilvusVectorStore,
|
|
21
|
+
CloudAstraDbVectorStore,
|
|
20
22
|
]
|