groundx 2.0.15__py3-none-any.whl → 2.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundx/__init__.py +73 -21
- groundx/buckets/__init__.py +2 -0
- groundx/buckets/client.py +55 -388
- groundx/buckets/raw_client.py +628 -0
- groundx/client.py +22 -21
- groundx/core/__init__.py +5 -0
- groundx/core/api_error.py +13 -5
- groundx/core/client_wrapper.py +4 -3
- groundx/core/force_multipart.py +16 -0
- groundx/core/http_client.py +76 -32
- groundx/core/http_response.py +55 -0
- groundx/core/jsonable_encoder.py +0 -1
- groundx/core/pydantic_utilities.py +71 -112
- groundx/core/serialization.py +7 -3
- groundx/csv_splitter.py +64 -0
- groundx/customer/__init__.py +2 -0
- groundx/customer/client.py +31 -43
- groundx/customer/raw_client.py +91 -0
- groundx/documents/__init__.py +1 -2
- groundx/documents/client.py +455 -953
- groundx/documents/raw_client.py +1450 -0
- groundx/errors/__init__.py +2 -0
- groundx/errors/bad_request_error.py +4 -3
- groundx/errors/unauthorized_error.py +4 -3
- groundx/extract/__init__.py +48 -0
- groundx/extract/agents/__init__.py +7 -0
- groundx/extract/agents/agent.py +202 -0
- groundx/extract/classes/__init__.py +24 -0
- groundx/extract/classes/agent.py +23 -0
- groundx/extract/classes/api.py +15 -0
- groundx/extract/classes/document.py +338 -0
- groundx/extract/classes/field.py +88 -0
- groundx/extract/classes/groundx.py +147 -0
- groundx/extract/classes/prompt.py +36 -0
- groundx/extract/classes/test_document.py +109 -0
- groundx/extract/classes/test_field.py +43 -0
- groundx/extract/classes/test_groundx.py +223 -0
- groundx/extract/classes/test_prompt.py +68 -0
- groundx/extract/post_process/__init__.py +7 -0
- groundx/extract/post_process/post_process.py +33 -0
- groundx/extract/services/.DS_Store +0 -0
- groundx/extract/services/__init__.py +14 -0
- groundx/extract/services/csv.py +76 -0
- groundx/extract/services/logger.py +126 -0
- groundx/extract/services/logging_cfg.py +53 -0
- groundx/extract/services/ratelimit.py +104 -0
- groundx/extract/services/sheets_client.py +160 -0
- groundx/extract/services/status.py +197 -0
- groundx/extract/services/upload.py +68 -0
- groundx/extract/services/upload_minio.py +122 -0
- groundx/extract/services/upload_s3.py +91 -0
- groundx/extract/services/utility.py +52 -0
- groundx/extract/settings/__init__.py +15 -0
- groundx/extract/settings/settings.py +212 -0
- groundx/extract/settings/test_settings.py +512 -0
- groundx/extract/tasks/__init__.py +6 -0
- groundx/extract/tasks/utility.py +27 -0
- groundx/extract/utility/__init__.py +15 -0
- groundx/extract/utility/classes.py +193 -0
- groundx/extract/utility/test_utility.py +81 -0
- groundx/groups/__init__.py +2 -0
- groundx/groups/client.py +63 -550
- groundx/groups/raw_client.py +901 -0
- groundx/health/__init__.py +2 -0
- groundx/health/client.py +35 -101
- groundx/health/raw_client.py +193 -0
- groundx/ingest.py +771 -0
- groundx/search/__init__.py +2 -0
- groundx/search/client.py +94 -227
- groundx/search/raw_client.py +442 -0
- groundx/search/types/__init__.py +2 -0
- groundx/types/__init__.py +68 -16
- groundx/types/bounding_box_detail.py +4 -4
- groundx/types/bucket_detail.py +5 -5
- groundx/types/bucket_list_response.py +17 -3
- groundx/types/bucket_response.py +3 -3
- groundx/types/bucket_update_detail.py +4 -4
- groundx/types/bucket_update_response.py +3 -3
- groundx/types/customer_detail.py +2 -2
- groundx/types/customer_response.py +3 -3
- groundx/types/document.py +54 -0
- groundx/types/document_detail.py +16 -4
- groundx/types/document_list_response.py +4 -4
- groundx/types/document_local_ingest_request.py +7 -0
- groundx/types/document_lookup_response.py +8 -3
- groundx/types/document_response.py +3 -3
- groundx/types/document_type.py +21 -1
- groundx/types/group_detail.py +4 -4
- groundx/types/group_list_response.py +17 -3
- groundx/types/group_response.py +3 -3
- groundx/types/health_response.py +3 -3
- groundx/types/health_response_health.py +3 -3
- groundx/types/health_service.py +5 -5
- groundx/types/ingest_local_document.py +25 -0
- groundx/types/ingest_local_document_metadata.py +51 -0
- groundx/types/ingest_remote_document.py +15 -6
- groundx/types/ingest_response.py +4 -4
- groundx/types/{process_status_response_ingest.py → ingest_status.py} +8 -7
- groundx/types/{ingest_response_ingest.py → ingest_status_light.py} +7 -5
- groundx/types/ingest_status_progress.py +26 -0
- groundx/types/{process_status_response_ingest_progress_errors.py → ingest_status_progress_cancelled.py} +4 -4
- groundx/types/{process_status_response_ingest_progress_complete.py → ingest_status_progress_complete.py} +4 -4
- groundx/types/{process_status_response_ingest_progress_cancelled.py → ingest_status_progress_errors.py} +4 -4
- groundx/types/{process_status_response_ingest_progress_processing.py → ingest_status_progress_processing.py} +4 -4
- groundx/types/message_response.py +2 -2
- groundx/types/meter_detail.py +2 -2
- groundx/types/process_level.py +5 -0
- groundx/types/{process_status_response.py → processes_status_response.py} +8 -5
- groundx/types/processing_status.py +3 -1
- groundx/types/search_response.py +3 -3
- groundx/types/search_response_search.py +3 -3
- groundx/types/search_result_item.py +7 -5
- groundx/types/search_result_item_pages_item.py +41 -0
- groundx/types/subscription_detail.py +3 -3
- groundx/types/subscription_detail_meters.py +5 -5
- groundx/{documents/types/website_crawl_request_websites_item.py → types/website_source.py} +7 -7
- groundx/types/workflow_apply_request.py +24 -0
- groundx/types/workflow_detail.py +59 -0
- groundx/types/workflow_detail_chunk_strategy.py +5 -0
- groundx/types/workflow_detail_relationships.py +36 -0
- groundx/types/workflow_engine.py +58 -0
- groundx/types/workflow_engine_reasoning_effort.py +5 -0
- groundx/types/workflow_engine_service.py +7 -0
- groundx/types/workflow_prompt.py +37 -0
- groundx/types/workflow_prompt_group.py +25 -0
- groundx/types/workflow_prompt_role.py +5 -0
- groundx/types/workflow_request.py +31 -0
- groundx/types/workflow_request_chunk_strategy.py +5 -0
- groundx/types/workflow_response.py +20 -0
- groundx/types/workflow_step.py +33 -0
- groundx/types/workflow_step_config.py +33 -0
- groundx/types/workflow_step_config_field.py +8 -0
- groundx/types/workflow_steps.py +38 -0
- groundx/types/workflows_response.py +20 -0
- groundx/workflows/__init__.py +7 -0
- groundx/workflows/client.py +736 -0
- groundx/workflows/raw_client.py +841 -0
- groundx/workflows/types/__init__.py +7 -0
- groundx/workflows/types/workflows_get_request_id.py +5 -0
- {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/LICENSE +1 -1
- {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/METADATA +39 -22
- groundx-2.7.7.dist-info/RECORD +155 -0
- groundx/documents/types/__init__.py +0 -6
- groundx/documents/types/documents_ingest_local_request_files_item.py +0 -43
- groundx/types/process_status_response_ingest_progress.py +0 -26
- groundx-2.0.15.dist-info/RECORD +0 -82
- {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/WHEEL +0 -0
groundx/errors/__init__.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
|
2
2
|
|
|
3
|
-
from ..core.api_error import ApiError
|
|
4
3
|
import typing
|
|
5
4
|
|
|
5
|
+
from ..core.api_error import ApiError
|
|
6
|
+
|
|
6
7
|
|
|
7
8
|
class BadRequestError(ApiError):
    """API error raised for an HTTP 400 (Bad Request) response."""

    def __init__(
        self,
        body: typing.Optional[typing.Any],
        headers: typing.Optional[typing.Dict[str, str]] = None,
    ):
        # The status code is fixed at 400; body and headers are forwarded as given.
        super().__init__(status_code=400, headers=headers, body=body)
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
# This file was auto-generated by Fern from our API Definition.
|
|
2
2
|
|
|
3
|
-
from ..core.api_error import ApiError
|
|
4
3
|
import typing
|
|
5
4
|
|
|
5
|
+
from ..core.api_error import ApiError
|
|
6
|
+
|
|
6
7
|
|
|
7
8
|
class UnauthorizedError(ApiError):
    """API error raised for an HTTP 401 (Unauthorized) response."""

    def __init__(
        self,
        body: typing.Optional[typing.Any],
        headers: typing.Optional[typing.Dict[str, str]] = None,
    ):
        # The status code is fixed at 401; body and headers are forwarded as given.
        super().__init__(status_code=401, headers=headers, body=body)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from .agents import AgentCode, AgentTool
|
|
2
|
+
from .classes import (
|
|
3
|
+
AgentRequest,
|
|
4
|
+
Document,
|
|
5
|
+
DocumentRequest,
|
|
6
|
+
ExtractedField,
|
|
7
|
+
GroundXDocument,
|
|
8
|
+
ProcessResponse,
|
|
9
|
+
Prompt,
|
|
10
|
+
TestChunk,
|
|
11
|
+
TestDocumentPage,
|
|
12
|
+
TestField,
|
|
13
|
+
TestXRay,
|
|
14
|
+
XRayDocument,
|
|
15
|
+
)
|
|
16
|
+
from .services import Logger, RateLimit, SheetsClient, Status, Upload
|
|
17
|
+
from .settings import (
|
|
18
|
+
AgentSettings,
|
|
19
|
+
ContainerSettings,
|
|
20
|
+
ContainerUploadSettings,
|
|
21
|
+
GroundXSettings,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"AgentCode",
|
|
26
|
+
"AgentRequest",
|
|
27
|
+
"AgentSettings",
|
|
28
|
+
"AgentTool",
|
|
29
|
+
"ContainerSettings",
|
|
30
|
+
"ContainerUploadSettings",
|
|
31
|
+
"Document",
|
|
32
|
+
"DocumentRequest",
|
|
33
|
+
"ExtractedField",
|
|
34
|
+
"GroundXDocument",
|
|
35
|
+
"GroundXSettings",
|
|
36
|
+
"Logger",
|
|
37
|
+
"ProcessResponse",
|
|
38
|
+
"Prompt",
|
|
39
|
+
"RateLimit",
|
|
40
|
+
"SheetsClient",
|
|
41
|
+
"Status",
|
|
42
|
+
"TestChunk",
|
|
43
|
+
"TestDocumentPage",
|
|
44
|
+
"TestField",
|
|
45
|
+
"TestXRay",
|
|
46
|
+
"Upload",
|
|
47
|
+
"XRayDocument",
|
|
48
|
+
]
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
import json, pytest, traceback, typing
|
|
2
|
+
|
|
3
|
+
pytest.importorskip("PIL")
|
|
4
|
+
|
|
5
|
+
from PIL.Image import Image
|
|
6
|
+
|
|
7
|
+
from smolagents import ( # pyright: ignore[reportMissingTypeStubs]
|
|
8
|
+
CodeAgent,
|
|
9
|
+
Tool,
|
|
10
|
+
ToolCallingAgent,
|
|
11
|
+
)
|
|
12
|
+
from smolagents.models import ( # pyright: ignore[reportMissingTypeStubs]
|
|
13
|
+
OpenAIServerModel,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
from ..services.logger import Logger
|
|
17
|
+
from ..settings.settings import AgentSettings
|
|
18
|
+
from ..utility.classes import clean_json
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
prompt_suffix = """
|
|
22
|
+
Return only your response using the `final_answer` tool format:
|
|
23
|
+
|
|
24
|
+
```json
|
|
25
|
+
{{"answer": {{"type": RESPONSE_HERE, "description": "The final answer to the problem"}}}}
|
|
26
|
+
```
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def extract_response(res: typing.Dict[str, typing.Any]) -> typing.Any:
    """Unwrap the payload of a ``final_answer``-shaped agent response.

    Lookup order:

    1. ``res["answer"]["type"]`` when ``answer`` is a dict holding ``type``.
    2. ``res["type"]`` when present at the top level.
    3. ``res`` itself, unchanged.

    Args:
        res: The decoded agent response. Non-dict values are tolerated and
            returned unchanged.

    Returns:
        The unwrapped payload, or ``res`` when no known wrapper is found.
    """
    # Membership tests on strings/lists mean something else (substring /
    # element lookup) and would be followed by an invalid ``res["..."]``.
    if not isinstance(res, dict):
        return res

    answer = res.get("answer")
    # Only descend into ``answer`` when it is a mapping; a string answer that
    # merely contains the text "type" must not be indexed with a str key.
    if isinstance(answer, dict) and "type" in answer:
        return answer["type"]

    if "type" in res:
        return res["type"]

    return res
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def process_response(
    res: typing.Any,
    expected_types: typing.Union[type, typing.Tuple[type, ...]] = dict,
) -> typing.Any:
    """Coerce a raw agent result into one of ``expected_types``.

    A result that already has an expected type is returned as-is, except that
    dicts are passed through ``extract_response``. A single-element list is
    unwrapped through ``extract_response`` when a dict is acceptable. A string
    is cleaned with ``clean_json``, parsed as JSON, and the parsed value is
    put through the same checks.

    Args:
        res: The value returned by the agent run.
        expected_types: Type or tuple of types the caller will accept.

    Returns:
        The (possibly unwrapped) result.

    Raises:
        TypeError: If the result, or its JSON-decoded form, is not of an
            expected type. The current stack is printed first.
    """
    if isinstance(res, expected_types):
        return extract_response(res) if isinstance(res, typing.Dict) else res  # type: ignore

    # Single-item lists are only unwrapped when the caller accepts a dict.
    dict_expected = isinstance(dict(), expected_types)

    if isinstance(res, list) and dict_expected and len(res) == 1:  # pyright: ignore[reportUnknownArgumentType]
        return extract_response(res[0])  # pyright: ignore[reportUnknownArgumentType]

    if not isinstance(res, str):
        traceback.print_stack()
        raise TypeError(
            f"agent process result is not of expected type(s) {expected_types!r}, got {type(res)!r}"  # type: ignore
        )

    parsed = json.loads(clean_json(res))

    if isinstance(parsed, expected_types):
        return extract_response(parsed) if isinstance(parsed, typing.Dict) else parsed  # type: ignore

    if isinstance(parsed, list) and dict_expected and len(parsed) == 1:  # type: ignore
        return extract_response(parsed[0])  # type: ignore

    traceback.print_stack()
    raise TypeError(
        f"agent process result is not of expected type(s) {expected_types!r} after JSON parsing, got {type(parsed)!r}"  # type: ignore
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class AgentCode(CodeAgent):
    """``CodeAgent`` configured from ``AgentSettings`` with typed-result retries."""

    def __init__(
        self,
        settings: AgentSettings,
        log: Logger,
        name: typing.Optional[str] = None,
        description: typing.Optional[str] = None,
        tools: typing.Optional[typing.List[Tool]] = None,
        verbosity: typing.Optional[int] = 0,
    ):
        """Build the model from ``settings`` and initialise the code agent.

        Args:
            settings: Supplies ``model_id``, ``api_base``, ``get_api_key()``,
                ``imports`` and ``max_steps``.
            log: Logger used to report retries in ``process``.
            name: Optional agent name forwarded to the base class.
            description: Optional agent description forwarded to the base class.
            tools: Tools made available to the agent; defaults to none.
            verbosity: Forwarded as ``verbosity_level``.
        """
        if tools is None:
            tools = []

        model = OpenAIServerModel(
            model_id=settings.model_id,
            api_base=settings.api_base,
            api_key=settings.get_api_key(),
        )

        super().__init__(  # pyright: ignore[reportUnknownMemberType]
            name=name,
            description=description,
            additional_authorized_imports=settings.imports,
            tools=tools,
            model=model,
            max_steps=settings.max_steps,
            verbosity_level=verbosity,
        )

        if self.python_executor.static_tools is None:  # type: ignore
            self.python_executor.static_tools = {}  # type: ignore

        # NOTE(review): this hands the builtin ``open`` to agent-generated
        # code, i.e. unrestricted file access from the executor — confirm
        # this is intended for every deployment.
        self.python_executor.static_tools.update({"open": open})  # type: ignore

        self.log = log

    def process(
        self,
        conflict: str,
        images: typing.List[Image],
        expected_types: typing.Union[type, typing.Tuple[type, ...]] = dict,
        attempt: int = 0,
    ) -> typing.Any:
        """Run the agent and return a result of one of ``expected_types``.

        ``prompt_suffix`` is appended verbatim to ``conflict``. When the
        result cannot be coerced by ``process_response`` the whole run is
        repeated; the error is raised once ``attempt`` exceeds 2.

        Args:
            conflict: The task prompt.
            images: Images passed to the agent run.
            expected_types: Type or tuple of types the caller will accept.
            attempt: Zero-based retry counter; callers normally leave it at 0.

        Returns:
            The coerced agent result.

        Raises:
            TypeError: If no run produced a usable result. The underlying
                error is attached as ``__cause__``.
        """
        res = super().run(  # pyright: ignore[reportUnknownMemberType]
            conflict + prompt_suffix,
            images=images,
        )

        try:
            return process_response(res=res, expected_types=expected_types)

        except Exception as e:
            if attempt > 2:
                # Chain explicitly so the parsing failure stays visible.
                raise TypeError(
                    f"agent process result is not of expected type(s) {expected_types!r}: [{e}]\n\n{res}"
                ) from e

            self.log.debug_msg(
                f"agent process result is not of expected type(s) {expected_types!r}: [{e}], attempting again [{attempt+1}]\n\n{res}"
            )

            return self.process(conflict, images, expected_types, attempt + 1)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class AgentTool(ToolCallingAgent):
    """``ToolCallingAgent`` configured from ``AgentSettings`` with typed-result retries."""

    def __init__(
        self,
        settings: AgentSettings,
        log: Logger,
        name: typing.Optional[str] = None,
        description: typing.Optional[str] = None,
        tools: typing.Optional[typing.List[Tool]] = None,
        verbosity: typing.Optional[int] = 0,
    ):
        """Build the model from ``settings`` and initialise the tool-calling agent.

        Args:
            settings: Supplies ``model_id``, ``api_base``, ``get_api_key()``
                and ``max_steps``.
            log: Logger used to report retries in ``process``.
            name: Optional agent name forwarded to the base class.
            description: Optional agent description forwarded to the base class.
            tools: Tools made available to the agent; defaults to none.
            verbosity: Forwarded as ``verbosity_level``.
        """
        if tools is None:
            tools = []

        model = OpenAIServerModel(
            model_id=settings.model_id,
            api_base=settings.api_base,
            api_key=settings.get_api_key(),
        )

        super().__init__(  # pyright: ignore[reportUnknownMemberType]
            name=name,
            description=description,
            tools=tools,
            model=model,
            max_steps=settings.max_steps,
            verbosity_level=verbosity,
        )

        self.log = log

    def process(
        self,
        conflict: str,
        images: typing.List[Image],
        expected_types: typing.Union[type, typing.Tuple[type, ...]] = dict,
        attempt: int = 0,
    ) -> typing.Any:
        """Run the agent and return a result of one of ``expected_types``.

        ``prompt_suffix`` is appended verbatim to ``conflict``. When the
        result cannot be coerced by ``process_response`` the whole run is
        repeated; the error is raised once ``attempt`` exceeds 2.

        Args:
            conflict: The task prompt.
            images: Images passed to the agent run.
            expected_types: Type or tuple of types the caller will accept.
            attempt: Zero-based retry counter; callers normally leave it at 0.

        Returns:
            The coerced agent result.

        Raises:
            TypeError: If no run produced a usable result. The underlying
                error is attached as ``__cause__``.
        """
        res = super().run(  # pyright: ignore[reportUnknownMemberType]
            conflict + prompt_suffix,
            images=images,
        )

        try:
            return process_response(res=res, expected_types=expected_types)

        except Exception as e:
            if attempt > 2:
                # Chain explicitly so the parsing failure stays visible.
                raise TypeError(
                    f"agent process result is not of expected type(s) {expected_types!r}: [{e}]\n\n{res}"
                ) from e

            # Report through the injected logger (as AgentCode does) instead
            # of writing to stdout.
            self.log.debug_msg(
                f"agent process result is not of expected type(s) {expected_types!r}: [{e}], attempting again [{attempt+1}]\n\n{res}"
            )

            return self.process(conflict, images, expected_types, attempt + 1)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from .agent import AgentRequest
|
|
2
|
+
from .api import ProcessResponse
|
|
3
|
+
from .document import Document, DocumentRequest
|
|
4
|
+
from .field import ExtractedField
|
|
5
|
+
from .groundx import GroundXDocument, XRayDocument
|
|
6
|
+
from .prompt import Prompt
|
|
7
|
+
from .test_field import TestField
|
|
8
|
+
from .test_groundx import TestChunk, TestDocumentPage, TestXRay
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"AgentRequest",
|
|
13
|
+
"Document",
|
|
14
|
+
"DocumentRequest",
|
|
15
|
+
"ExtractedField",
|
|
16
|
+
"GroundXDocument",
|
|
17
|
+
"ProcessResponse",
|
|
18
|
+
"Prompt",
|
|
19
|
+
"TestChunk",
|
|
20
|
+
"TestDocumentPage",
|
|
21
|
+
"TestField",
|
|
22
|
+
"TestXRay",
|
|
23
|
+
"XRayDocument",
|
|
24
|
+
]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
from pydantic import BaseModel, field_validator
|
|
3
|
+
|
|
4
|
+
from .document import Document, DocumentRequest
|
|
5
|
+
|
|
6
|
+
ReqT = typing.TypeVar("ReqT", bound=DocumentRequest)
|
|
7
|
+
DocT = typing.TypeVar("DocT", bound=Document)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AgentRequest(BaseModel, typing.Generic[ReqT, DocT]):
    """Generic request envelope pairing a document request with its document.

    Subclasses set ``allowed_request_types``; with the default empty list
    every ``request_type`` value fails validation.
    """

    # Class-level whitelist consulted by ``validate_request_type``.
    allowed_request_types: typing.ClassVar[typing.List[str]] = []
    request: ReqT
    request_type: str
    statement: DocT

    @field_validator("request_type")
    @classmethod
    def validate_request_type(cls, value: str):
        """Return ``value`` when whitelisted, otherwise raise ``ValueError``."""
        if value in cls.allowed_request_types:
            return value

        raise ValueError(
            f"Invalid request_type '{value}'. Must be one of {cls.allowed_request_types}"
        )
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ErrorResponse(BaseModel):
    """Error payload identifying the document and task it refers to.

    Fields may be populated by attribute name or by their camelCase alias
    (``populate_by_name=True``).
    """

    model_config = ConfigDict(populate_by_name=True)

    code: int
    # Aliased as "documentID".
    document_id: str = Field(alias="documentID")
    message: str
    # Aliased as "taskID".
    task_id: str = Field(alias="taskID")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class ProcessResponse:
    """Minimal response carrying a single message string."""

    message: str
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
import json, os, shutil, requests, time, typing
|
|
2
|
+
from datetime import datetime, timezone
|
|
3
|
+
from io import BytesIO
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from PIL import Image
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
from .groundx import GroundXDocument
|
|
10
|
+
from ..services.logger import Logger
|
|
11
|
+
from ..services.upload import Upload
|
|
12
|
+
from ..utility.classes import clean_json
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
DocT = typing.TypeVar("DocT", bound="Document")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Document(BaseModel):
    """Base model for a document populated from GroundX X-Ray output.

    Subclasses are expected to override ``add`` and ``finalize_init``; the
    base implementations only log a warning.
    """

    file_name: str = ""

    document_id: str = ""
    page_images: typing.List[str] = []
    source_url: str = ""
    task_id: str = ""

    _logger: typing.Optional[Logger] = PrivateAttr(default=None)

    @property
    def logger(self) -> typing.Optional[Logger]:
        """Logger used by ``print``; ``None`` when unset (or falsy)."""
        if self._logger:
            return self._logger

        return None

    @logger.setter
    def logger(self, value: Logger) -> None:
        self._logger = value

    @logger.deleter
    def logger(self) -> None:
        del self._logger

    @classmethod
    def from_request(
        cls: typing.Type[DocT],
        base_url: str,
        cache_dir: Path,
        req: "DocumentRequest",
        upload: typing.Optional[Upload] = None,
        **data: typing.Any,
    ) -> DocT:
        """Build a document from a request and its X-Ray result.

        Creates ``cls(**data)``, copies the ids and file name from ``req``,
        loads the X-Ray document, records page image URLs and the source URL,
        then feeds every key/value found in each chunk's ``sectionSummary``
        and ``suggestedText`` JSON to ``add``.

        Args:
            base_url: Passed to ``GroundXDocument``.
            cache_dir: Cache directory passed to ``xray``.
            req: Request supplying ``document_id``, ``file_name``, ``task_id``
                and ``clear_cache``.
            upload: Optional upload service passed to ``xray``.
            **data: Initial field values for the new instance.

        Returns:
            The populated instance, after ``finalize_init`` has run.

        Raises:
            Exception: If ``add`` reports an error for any key.
        """
        st = cls(**data)

        st.document_id = req.document_id
        st.file_name = req.file_name
        st.task_id = req.task_id

        xray_doc = GroundXDocument(
            base_url=base_url,
            documentID=req.document_id,
            taskID=req.task_id,
        ).xray(upload=upload, cache_dir=cache_dir, clear_cache=req.clear_cache)

        for page in xray_doc.documentPages:
            st.page_images.append(page.pageUrl)

        st.source_url = xray_doc.sourceUrl

        for chunk in xray_doc.chunks:
            # Summary first, then suggested text. A payload that cannot be
            # used abandons the rest of this chunk, so a bad summary also
            # skips the suggested text (unchanged from before).
            for label, raw in (
                ("stxt", chunk.sectionSummary),
                ("mtxt", chunk.suggestedText),
            ):
                text = clean_json(raw or "{}")
                try:
                    fields = json.loads(text)
                except json.JSONDecodeError:
                    st.print("ERROR", f"\njson.JSONDecodeError {label}\n{text}\n\n")
                    break

                # Valid JSON need not be an object; lists and scalars have no
                # ``.items()`` and previously crashed with AttributeError.
                if not isinstance(fields, dict):
                    st.print(
                        "ERROR",
                        f"\nexpected JSON object in {label}, got {type(fields).__name__}\n{text}\n\n",
                    )
                    break

                for key, value in fields.items():
                    err = st.add(key, value)
                    if err:
                        raise Exception(f"\n\ninit document error:\n\t{err}\n")

        st.finalize_init()

        return st

    def add(self, k: str, value: typing.Any) -> typing.Union[str, None]:
        """Store one extracted key/value; return an error message or ``None``.

        The base implementation stores nothing and only logs a warning.
        """
        self.print("WARNING", "add is not implemented")

        return None

    def finalize_init(self) -> None:
        """Hook called at the end of ``from_request``; base version only warns."""
        self.print("WARNING", "finalize_init is not implemented")

    def print(self, level: str, msg: str) -> None:
        """Log ``msg`` at ``level``, or print it when no logger is set.

        ``level`` is case-insensitive. "ERROR", "INFO" and "WARN"/"WARNING"
        map to the matching logger call; anything else is logged as debug.
        """
        if not self.logger:
            print(msg)
            return

        lvl = level.upper()
        if lvl == "ERROR":
            self.logger.error_msg(msg, self.file_name, self.document_id, self.task_id)
        elif lvl == "INFO":
            self.logger.info_msg(msg, self.file_name, self.document_id, self.task_id)
        elif lvl in ("WARN", "WARNING"):
            self.logger.warning_msg(msg, self.file_name, self.document_id, self.task_id)
        else:
            self.logger.debug_msg(msg, self.file_name, self.document_id, self.task_id)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _new_page_image_dict() -> typing.Dict[str, int]:
|
|
125
|
+
return {}
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _new_page_images() -> typing.List[Image.Image]:
    """Return a fresh, empty list for use as a per-instance default."""
    return list()
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class DocumentRequest(BaseModel):
    """Incoming document-processing request plus per-request working state.

    The public fields can be populated by attribute name or camelCase alias.
    Everything else (logger, flags, image cache, debug path, write lock) is
    held in private attributes exposed through properties.
    """

    model_config = ConfigDict(populate_by_name=True)

    # Seconds allowed for each page-image download in ``load_images``. Without
    # a timeout ``requests.get`` can block indefinitely on a stalled server.
    download_timeout: typing.ClassVar[float] = 30.0

    callback_url: str = Field(alias="callbackURL", default="")
    document_id: str = Field(alias="documentID")
    file_name: str = Field(alias="fileName")
    model_id: int = Field(alias="modelID")
    processor_id: int = Field(alias="processorID")
    task_id: str = Field(alias="taskID")

    _logger: typing.Optional[Logger] = PrivateAttr(default=None)

    _append_values: bool = PrivateAttr(default_factory=bool)
    _clear_cache: bool = PrivateAttr(default_factory=bool)
    _debug_path: typing.Optional[str] = PrivateAttr(default=None)
    # Maps a page URL to its index in ``_page_images``.
    _page_image_dict: typing.Dict[str, int] = PrivateAttr(
        default_factory=_new_page_image_dict
    )
    _page_images: typing.List[Image.Image] = PrivateAttr(
        default_factory=_new_page_images
    )
    # UTC epoch seconds captured at construction; prefixes debug file names.
    _start: int = PrivateAttr(
        default_factory=lambda: int(datetime.now(timezone.utc).timestamp())
    )
    _write_lock: typing.Optional[typing.Any] = PrivateAttr(default=None)

    @property
    def append_values(self) -> bool:
        return self._append_values

    @append_values.setter
    def append_values(self, value: bool) -> None:
        self._append_values = value

    @append_values.deleter
    def append_values(self) -> None:
        del self._append_values

    @property
    def clear_cache(self) -> bool:
        return self._clear_cache

    @clear_cache.setter
    def clear_cache(self, value: bool) -> None:
        self._clear_cache = value

    @clear_cache.deleter
    def clear_cache(self) -> None:
        del self._clear_cache

    @property
    def debug_path(self) -> typing.Optional[str]:
        return self._debug_path

    @debug_path.setter
    def debug_path(self, value: str) -> None:
        self._debug_path = value

    @debug_path.deleter
    def debug_path(self) -> None:
        del self._debug_path

    @property
    def logger(self) -> typing.Optional[Logger]:
        """Logger used by ``print``; ``None`` when unset (or falsy)."""
        if self._logger:
            return self._logger

        return None

    @logger.setter
    def logger(self, value: Logger) -> None:
        self._logger = value

    @logger.deleter
    def logger(self) -> None:
        del self._logger

    @property
    def page_images(self) -> typing.List[Image.Image]:
        return self._page_images

    @page_images.setter
    def page_images(self, value: typing.List[Image.Image]) -> None:
        self._page_images = value

    @page_images.deleter
    def page_images(self) -> None:
        del self._page_images

    @property
    def page_image_dict(self) -> typing.Dict[str, int]:
        return self._page_image_dict

    @page_image_dict.setter
    def page_image_dict(self, value: typing.Dict[str, int]) -> None:
        self._page_image_dict = value

    @page_image_dict.deleter
    def page_image_dict(self) -> None:
        del self._page_image_dict

    @property
    def start(self) -> int:
        return self._start

    @property
    def write_lock(self) -> typing.Optional[typing.Any]:
        return self._write_lock

    @write_lock.setter
    def write_lock(self, value: typing.Optional[typing.Any]) -> None:
        self._write_lock = value

    @write_lock.deleter
    def write_lock(self) -> None:
        del self._write_lock

    def clear_debug(self) -> None:
        """Delete this request's debug directory, ignoring errors.

        Does nothing when ``debug_path`` is unset.
        """
        if self.debug_path:
            # NOTE(review): ``file_name`` is used unsanitised. An empty name
            # or one containing ".." makes this rmtree act on ``debug_path``
            # itself or on a directory outside it — confirm upstream
            # validation. ``replace`` also strips every ".pdf" occurrence,
            # not only a trailing suffix.
            file_path = f"{self.debug_path}/{self.file_name.replace('.pdf','')}"
            shutil.rmtree(file_path, ignore_errors=True)

    def load_images(
        self,
        imgs: typing.List[str],
        upload: typing.Optional[Upload] = None,
        attempt: int = 0,
        should_sleep: bool = True,
    ) -> typing.List[Image.Image]:
        """Load page images by URL, using the per-request cache.

        Each URL is served from the cache when already loaded. Otherwise it
        is fetched through ``upload.get_object`` when an upload service is
        given and returns data, falling back to an HTTP GET. Loaded images
        are cached for later calls.

        A failed HTTP download restarts the whole call (already-loaded pages
        then come from the cache) while ``attempt`` is below 2. After that
        the page is skipped, so the result may be shorter than ``imgs``.

        Args:
            imgs: Page image URLs.
            upload: Optional object store tried before HTTP.
            attempt: Zero-based retry counter; callers normally leave it at 0.
            should_sleep: Sleep ``2 * attempt + 1`` seconds before retrying.

        Returns:
            The images that could be loaded, in input order.
        """
        pageImages: typing.List[Image.Image] = []
        for page in imgs:
            if page in self.page_image_dict:
                self.print(
                    "WARN",
                    f"[{attempt}] loading cached [{self.page_image_dict[page]}] [{page}]",
                )
                pageImages.append(self.page_images[self.page_image_dict[page]])
                continue

            if upload:
                parsed = urlparse(page)
                path = parsed.path + ("?" + parsed.query if parsed.query else "")
                ru = upload.get_object(path)
                if ru:
                    img = Image.open(BytesIO(ru))
                    if img:
                        self.page_image_dict[page] = len(self.page_images)
                        self.page_images.append(img)
                        pageImages.append(img)
                        continue

            try:
                self.print("WARN", f"[{attempt}] downloading [{page}]")
                # Bound the request so a stalled server cannot hang the
                # worker; a timeout is handled like any other failure below.
                resp = requests.get(page, timeout=self.download_timeout)
                resp.raise_for_status()
                img = Image.open(BytesIO(resp.content))
                if img:
                    self.page_image_dict[page] = len(self.page_images)
                    self.page_images.append(img)
                    pageImages.append(img)
            except Exception as e:
                self.print(
                    "ERROR", f"[{attempt}] Failed to load image from {page}: {e}"
                )
                if attempt < 2:
                    if should_sleep:
                        time.sleep(2 * attempt + 1)
                    return self.load_images(
                        imgs, upload, attempt + 1, should_sleep=should_sleep
                    )

        return pageImages

    def print(self, level: str, msg: str) -> None:
        """Log ``msg`` at ``level``, or print it when no logger is set.

        ``level`` is case-insensitive. "ERROR", "INFO" and "WARN"/"WARNING"
        map to the matching logger call; anything else is logged as debug.
        """
        if not self.logger:
            print(msg)
            return

        lvl = level.upper()
        if lvl == "ERROR":
            self.logger.error_msg(msg, self.file_name, self.document_id, self.task_id)
        elif lvl == "INFO":
            self.logger.info_msg(msg, self.file_name, self.document_id, self.task_id)
        elif lvl in ("WARN", "WARNING"):
            self.logger.warning_msg(msg, self.file_name, self.document_id, self.task_id)
        else:
            self.logger.debug_msg(msg, self.file_name, self.document_id, self.task_id)

    def write_debug(self, file_name: str, data: typing.Any) -> None:
        """Write ``data`` to ``<debug_path>/<document>/<start>_<file_name>``.

        Does nothing when ``debug_path`` is unset. Non-string data is JSON
        encoded; an exception instance that cannot be encoded is written as
        ``str(data)``.

        Raises:
            Exception: Whatever ``json.dumps`` raised, when ``data`` is
                neither serialisable nor an exception instance.
        """
        if not self.debug_path:
            return

        os.makedirs(self.debug_path, exist_ok=True)
        # NOTE(review): same unsanitised ``file_name`` as in ``clear_debug``.
        file_path = f"{self.debug_path}/{self.file_name.replace('.pdf','')}"
        os.makedirs(file_path, exist_ok=True)

        if not isinstance(data, str):
            try:
                data = json.dumps(data)
            except Exception as e:
                if isinstance(data, Exception):
                    data = str(data)
                else:
                    self.print("ERROR", f"write_debug exception: {e}")
                    # Bare raise keeps the original traceback intact.
                    raise

        with open(f"{file_path}/{self.start}_{file_name}", "w", encoding="utf-8") as f:
            f.write(data)
|