groundx 2.0.15__py3-none-any.whl → 2.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. groundx/__init__.py +73 -21
  2. groundx/buckets/__init__.py +2 -0
  3. groundx/buckets/client.py +55 -388
  4. groundx/buckets/raw_client.py +628 -0
  5. groundx/client.py +22 -21
  6. groundx/core/__init__.py +5 -0
  7. groundx/core/api_error.py +13 -5
  8. groundx/core/client_wrapper.py +4 -3
  9. groundx/core/force_multipart.py +16 -0
  10. groundx/core/http_client.py +76 -32
  11. groundx/core/http_response.py +55 -0
  12. groundx/core/jsonable_encoder.py +0 -1
  13. groundx/core/pydantic_utilities.py +71 -112
  14. groundx/core/serialization.py +7 -3
  15. groundx/csv_splitter.py +64 -0
  16. groundx/customer/__init__.py +2 -0
  17. groundx/customer/client.py +31 -43
  18. groundx/customer/raw_client.py +91 -0
  19. groundx/documents/__init__.py +1 -2
  20. groundx/documents/client.py +455 -953
  21. groundx/documents/raw_client.py +1450 -0
  22. groundx/errors/__init__.py +2 -0
  23. groundx/errors/bad_request_error.py +4 -3
  24. groundx/errors/unauthorized_error.py +4 -3
  25. groundx/extract/__init__.py +48 -0
  26. groundx/extract/agents/__init__.py +7 -0
  27. groundx/extract/agents/agent.py +202 -0
  28. groundx/extract/classes/__init__.py +24 -0
  29. groundx/extract/classes/agent.py +23 -0
  30. groundx/extract/classes/api.py +15 -0
  31. groundx/extract/classes/document.py +338 -0
  32. groundx/extract/classes/field.py +88 -0
  33. groundx/extract/classes/groundx.py +147 -0
  34. groundx/extract/classes/prompt.py +36 -0
  35. groundx/extract/classes/test_document.py +109 -0
  36. groundx/extract/classes/test_field.py +43 -0
  37. groundx/extract/classes/test_groundx.py +223 -0
  38. groundx/extract/classes/test_prompt.py +68 -0
  39. groundx/extract/post_process/__init__.py +7 -0
  40. groundx/extract/post_process/post_process.py +33 -0
  41. groundx/extract/services/.DS_Store +0 -0
  42. groundx/extract/services/__init__.py +14 -0
  43. groundx/extract/services/csv.py +76 -0
  44. groundx/extract/services/logger.py +126 -0
  45. groundx/extract/services/logging_cfg.py +53 -0
  46. groundx/extract/services/ratelimit.py +104 -0
  47. groundx/extract/services/sheets_client.py +160 -0
  48. groundx/extract/services/status.py +197 -0
  49. groundx/extract/services/upload.py +68 -0
  50. groundx/extract/services/upload_minio.py +122 -0
  51. groundx/extract/services/upload_s3.py +91 -0
  52. groundx/extract/services/utility.py +52 -0
  53. groundx/extract/settings/__init__.py +15 -0
  54. groundx/extract/settings/settings.py +212 -0
  55. groundx/extract/settings/test_settings.py +512 -0
  56. groundx/extract/tasks/__init__.py +6 -0
  57. groundx/extract/tasks/utility.py +27 -0
  58. groundx/extract/utility/__init__.py +15 -0
  59. groundx/extract/utility/classes.py +193 -0
  60. groundx/extract/utility/test_utility.py +81 -0
  61. groundx/groups/__init__.py +2 -0
  62. groundx/groups/client.py +63 -550
  63. groundx/groups/raw_client.py +901 -0
  64. groundx/health/__init__.py +2 -0
  65. groundx/health/client.py +35 -101
  66. groundx/health/raw_client.py +193 -0
  67. groundx/ingest.py +771 -0
  68. groundx/search/__init__.py +2 -0
  69. groundx/search/client.py +94 -227
  70. groundx/search/raw_client.py +442 -0
  71. groundx/search/types/__init__.py +2 -0
  72. groundx/types/__init__.py +68 -16
  73. groundx/types/bounding_box_detail.py +4 -4
  74. groundx/types/bucket_detail.py +5 -5
  75. groundx/types/bucket_list_response.py +17 -3
  76. groundx/types/bucket_response.py +3 -3
  77. groundx/types/bucket_update_detail.py +4 -4
  78. groundx/types/bucket_update_response.py +3 -3
  79. groundx/types/customer_detail.py +2 -2
  80. groundx/types/customer_response.py +3 -3
  81. groundx/types/document.py +54 -0
  82. groundx/types/document_detail.py +16 -4
  83. groundx/types/document_list_response.py +4 -4
  84. groundx/types/document_local_ingest_request.py +7 -0
  85. groundx/types/document_lookup_response.py +8 -3
  86. groundx/types/document_response.py +3 -3
  87. groundx/types/document_type.py +21 -1
  88. groundx/types/group_detail.py +4 -4
  89. groundx/types/group_list_response.py +17 -3
  90. groundx/types/group_response.py +3 -3
  91. groundx/types/health_response.py +3 -3
  92. groundx/types/health_response_health.py +3 -3
  93. groundx/types/health_service.py +5 -5
  94. groundx/types/ingest_local_document.py +25 -0
  95. groundx/types/ingest_local_document_metadata.py +51 -0
  96. groundx/types/ingest_remote_document.py +15 -6
  97. groundx/types/ingest_response.py +4 -4
  98. groundx/types/{process_status_response_ingest.py → ingest_status.py} +8 -7
  99. groundx/types/{ingest_response_ingest.py → ingest_status_light.py} +7 -5
  100. groundx/types/ingest_status_progress.py +26 -0
  101. groundx/types/{process_status_response_ingest_progress_errors.py → ingest_status_progress_cancelled.py} +4 -4
  102. groundx/types/{process_status_response_ingest_progress_complete.py → ingest_status_progress_complete.py} +4 -4
  103. groundx/types/{process_status_response_ingest_progress_cancelled.py → ingest_status_progress_errors.py} +4 -4
  104. groundx/types/{process_status_response_ingest_progress_processing.py → ingest_status_progress_processing.py} +4 -4
  105. groundx/types/message_response.py +2 -2
  106. groundx/types/meter_detail.py +2 -2
  107. groundx/types/process_level.py +5 -0
  108. groundx/types/{process_status_response.py → processes_status_response.py} +8 -5
  109. groundx/types/processing_status.py +3 -1
  110. groundx/types/search_response.py +3 -3
  111. groundx/types/search_response_search.py +3 -3
  112. groundx/types/search_result_item.py +7 -5
  113. groundx/types/search_result_item_pages_item.py +41 -0
  114. groundx/types/subscription_detail.py +3 -3
  115. groundx/types/subscription_detail_meters.py +5 -5
  116. groundx/{documents/types/website_crawl_request_websites_item.py → types/website_source.py} +7 -7
  117. groundx/types/workflow_apply_request.py +24 -0
  118. groundx/types/workflow_detail.py +59 -0
  119. groundx/types/workflow_detail_chunk_strategy.py +5 -0
  120. groundx/types/workflow_detail_relationships.py +36 -0
  121. groundx/types/workflow_engine.py +58 -0
  122. groundx/types/workflow_engine_reasoning_effort.py +5 -0
  123. groundx/types/workflow_engine_service.py +7 -0
  124. groundx/types/workflow_prompt.py +37 -0
  125. groundx/types/workflow_prompt_group.py +25 -0
  126. groundx/types/workflow_prompt_role.py +5 -0
  127. groundx/types/workflow_request.py +31 -0
  128. groundx/types/workflow_request_chunk_strategy.py +5 -0
  129. groundx/types/workflow_response.py +20 -0
  130. groundx/types/workflow_step.py +33 -0
  131. groundx/types/workflow_step_config.py +33 -0
  132. groundx/types/workflow_step_config_field.py +8 -0
  133. groundx/types/workflow_steps.py +38 -0
  134. groundx/types/workflows_response.py +20 -0
  135. groundx/workflows/__init__.py +7 -0
  136. groundx/workflows/client.py +736 -0
  137. groundx/workflows/raw_client.py +841 -0
  138. groundx/workflows/types/__init__.py +7 -0
  139. groundx/workflows/types/workflows_get_request_id.py +5 -0
  140. {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/LICENSE +1 -1
  141. {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/METADATA +39 -22
  142. groundx-2.7.7.dist-info/RECORD +155 -0
  143. groundx/documents/types/__init__.py +0 -6
  144. groundx/documents/types/documents_ingest_local_request_files_item.py +0 -43
  145. groundx/types/process_status_response_ingest_progress.py +0 -26
  146. groundx-2.0.15.dist-info/RECORD +0 -82
  147. {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/WHEEL +0 -0
@@ -1,5 +1,7 @@
1
1
  # This file was auto-generated by Fern from our API Definition.
2
2
 
3
+ # isort: skip_file
4
+
3
5
  from .bad_request_error import BadRequestError
4
6
  from .unauthorized_error import UnauthorizedError
5
7
 
@@ -1,9 +1,10 @@
1
1
  # This file was auto-generated by Fern from our API Definition.
2
2
 
3
- from ..core.api_error import ApiError
4
3
  import typing
5
4
 
5
+ from ..core.api_error import ApiError
6
+
6
7
 
7
8
class BadRequestError(ApiError):
    """ApiError subclass whose status code is fixed to 400."""

    def __init__(self, body: typing.Optional[typing.Any], headers: typing.Optional[typing.Dict[str, str]] = None):
        # Forward the body and the optional headers to ApiError with the status pinned to 400.
        super().__init__(status_code=400, headers=headers, body=body)
@@ -1,9 +1,10 @@
1
1
  # This file was auto-generated by Fern from our API Definition.
2
2
 
3
- from ..core.api_error import ApiError
4
3
  import typing
5
4
 
5
+ from ..core.api_error import ApiError
6
+
6
7
 
7
8
class UnauthorizedError(ApiError):
    """ApiError subclass whose status code is fixed to 401."""

    def __init__(self, body: typing.Optional[typing.Any], headers: typing.Optional[typing.Dict[str, str]] = None):
        # Forward the body and the optional headers to ApiError with the status pinned to 401.
        super().__init__(status_code=401, headers=headers, body=body)
@@ -0,0 +1,48 @@
1
+ from .agents import AgentCode, AgentTool
2
+ from .classes import (
3
+ AgentRequest,
4
+ Document,
5
+ DocumentRequest,
6
+ ExtractedField,
7
+ GroundXDocument,
8
+ ProcessResponse,
9
+ Prompt,
10
+ TestChunk,
11
+ TestDocumentPage,
12
+ TestField,
13
+ TestXRay,
14
+ XRayDocument,
15
+ )
16
+ from .services import Logger, RateLimit, SheetsClient, Status, Upload
17
+ from .settings import (
18
+ AgentSettings,
19
+ ContainerSettings,
20
+ ContainerUploadSettings,
21
+ GroundXSettings,
22
+ )
23
+
24
# Public names of the extract package; every entry is imported at the top of this module
# from the agents, classes, services, or settings subpackages.
__all__ = [
    "AgentCode",
    "AgentRequest",
    "AgentSettings",
    "AgentTool",
    "ContainerSettings",
    "ContainerUploadSettings",
    "Document",
    "DocumentRequest",
    "ExtractedField",
    "GroundXDocument",
    "GroundXSettings",
    "Logger",
    "ProcessResponse",
    "Prompt",
    "RateLimit",
    "SheetsClient",
    "Status",
    "TestChunk",
    "TestDocumentPage",
    "TestField",
    "TestXRay",
    "Upload",
    "XRayDocument",
]
@@ -0,0 +1,7 @@
1
+ from .agent import AgentCode, AgentTool
2
+
3
+
4
# Public names of the agents subpackage; both are imported from .agent above.
__all__ = [
    "AgentCode",
    "AgentTool",
]
@@ -0,0 +1,202 @@
1
+ import json, pytest, traceback, typing
2
+
3
+ pytest.importorskip("PIL")
4
+
5
+ from PIL.Image import Image
6
+
7
+ from smolagents import ( # pyright: ignore[reportMissingTypeStubs]
8
+ CodeAgent,
9
+ Tool,
10
+ ToolCallingAgent,
11
+ )
12
+ from smolagents.models import ( # pyright: ignore[reportMissingTypeStubs]
13
+ OpenAIServerModel,
14
+ )
15
+
16
+ from ..services.logger import Logger
17
+ from ..settings.settings import AgentSettings
18
+ from ..utility.classes import clean_json
19
+
20
+
21
# Instruction text that AgentCode.process and AgentTool.process append to the caller's
# prompt by string concatenation (`conflict + prompt_suffix`).
# NOTE(review): this is a plain string and nothing visible in this module passes it through
# str.format, so the doubled braces reach the model literally — confirm that is intended.
prompt_suffix = """
Return only your response using the `final_answer` tool format:

```json
{{"answer": {{"type": RESPONSE_HERE, "description": "The final answer to the problem"}}}}
```
"""
28
+
29
+
30
def extract_response(res: typing.Dict[str, typing.Any]) -> typing.Any:
    """Unwrap the payload from a `final_answer`-style response.

    Lookup order:
      1. ``res["answer"]["type"]`` when ``res["answer"]`` is a mapping containing ``"type"``.
      2. ``res["type"]`` when ``"type"`` is present at the top level.
      3. ``res`` itself, unchanged.

    Args:
        res: Parsed agent response.

    Returns:
        The unwrapped value, or ``res`` when no known wrapper shape is found.
    """
    if "answer" in res:
        answer = res["answer"]
        # Only unwrap a mapping. For a str, `"type" in answer` is a substring test and
        # `answer["type"]` raises TypeError; for None the membership test itself raises;
        # for a list, indexing with "type" raises.
        if isinstance(answer, typing.Mapping) and "type" in answer:
            return answer["type"]

    if "type" in res:
        return res["type"]

    return res
38
+
39
+
40
def process_response(
    res: typing.Any,
    expected_types: typing.Union[type, typing.Tuple[type, ...]] = dict,
) -> typing.Any:
    """Coerce a raw agent result into one of ``expected_types``.

    A result that already has an expected type is returned directly, with dicts unwrapped
    via ``extract_response``. A one-element list is unwrapped when a dict is acceptable.
    A string is cleaned with ``clean_json``, parsed as JSON, and then judged by the same
    rules. Anything else is rejected.

    Args:
        res: Raw value returned by the agent run.
        expected_types: Type or tuple of types the caller accepts.

    Returns:
        The (possibly unwrapped) result.

    Raises:
        TypeError: If the value, or the value parsed from a string, is not of an expected
            type. The current stack is printed before raising.
    """
    # Fast path: the raw value is already acceptable.
    if isinstance(res, expected_types):
        if isinstance(res, typing.Dict):
            return extract_response(res)  # type: ignore
        return res

    dict_is_expected = isinstance(dict(), expected_types)

    # A single-item list stands in for its only element when a dict is acceptable.
    if isinstance(res, list) and dict_is_expected and len(res) == 1:  # type: ignore
        return extract_response(res[0])  # type: ignore

    if not isinstance(res, str):
        traceback.print_stack()
        raise TypeError(
            f"agent process result is not of expected type(s) {expected_types!r}, got {type(res)!r}"  # type: ignore
        )

    # Strings are treated as JSON text and re-evaluated after parsing.
    loaded = json.loads(clean_json(res))

    if isinstance(loaded, expected_types):
        if isinstance(loaded, typing.Dict):
            return extract_response(loaded)  # type: ignore
        return loaded

    if isinstance(loaded, list) and dict_is_expected and len(loaded) == 1:  # type: ignore
        return extract_response(loaded[0])  # type: ignore

    traceback.print_stack()
    raise TypeError(
        f"agent process result is not of expected type(s) {expected_types!r} after JSON parsing, got {type(loaded)!r}"  # type: ignore
    )
81
+
82
+
83
class AgentCode(CodeAgent):
    """CodeAgent backed by an OpenAIServerModel built from AgentSettings.

    The constructor also registers the builtin ``open`` in the python executor's static
    tools. ``process`` runs the agent and validates the result with ``process_response``,
    re-running on failure.
    """

    def __init__(
        self,
        settings: AgentSettings,
        log: Logger,
        name: typing.Optional[str] = None,
        description: typing.Optional[str] = None,
        tools: typing.Optional[typing.List[Tool]] = None,
        verbosity: typing.Optional[int] = 0,
    ):
        """Build the model from ``settings`` and initialise the underlying CodeAgent.

        Args:
            settings: Supplies model_id, api_base, the API key, authorized imports, and
                max_steps.
            log: Logger used for retry messages.
            name: Passed through to CodeAgent.
            description: Passed through to CodeAgent.
            tools: Tools for the agent; ``None`` means no tools.
            verbosity: Passed to CodeAgent as ``verbosity_level``.
        """
        if tools is None:
            tools = []

        model = OpenAIServerModel(
            model_id=settings.model_id,
            api_base=settings.api_base,
            api_key=settings.get_api_key(),
        )

        super().__init__(  # pyright: ignore[reportUnknownMemberType]
            name=name,
            description=description,
            additional_authorized_imports=settings.imports,
            tools=tools,
            model=model,
            max_steps=settings.max_steps,
            verbosity_level=verbosity,
        )

        if self.python_executor.static_tools is None:  # type: ignore
            self.python_executor.static_tools = {}  # type: ignore

        # NOTE(review): this makes the builtin `open` callable from agent-executed code,
        # i.e. generated code can presumably read/write local files — confirm intended.
        self.python_executor.static_tools.update({"open": open})  # type: ignore

        self.log = log

    def process(
        self,
        conflict: str,
        images: typing.List[Image],
        expected_types: typing.Union[type, typing.Tuple[type, ...]] = dict,
        attempt: int = 0,
    ) -> typing.Any:
        """Run the agent on ``conflict`` and return a result of an expected type.

        Args:
            conflict: Prompt text; ``prompt_suffix`` is appended before running.
            images: Images passed to the agent run.
            expected_types: Type or tuple of types accepted by ``process_response``.
            attempt: Zero-based retry counter; callers normally leave it at 0.

        Returns:
            The value produced by ``process_response``.

        Raises:
            TypeError: When processing still fails once ``attempt`` exceeds 2. The
                original exception is chained as the cause.
        """
        res = super().run(  # pyright: ignore[reportUnknownMemberType]
            conflict + prompt_suffix,
            images=images,
        )

        try:
            return process_response(res=res, expected_types=expected_types)

        except Exception as e:
            if attempt > 2:
                # Chain the underlying failure so the real cause stays in the traceback.
                raise TypeError(
                    f"agent process result is not of expected type(s) {expected_types!r}: [{e}]\n\n{res}"
                ) from e

            self.log.debug_msg(
                f"agent process result is not of expected type(s) {expected_types!r}: [{e}], attempting again [{attempt+1}]\n\n{res}"
            )

            return self.process(conflict, images, expected_types, attempt + 1)
145
+
146
+
147
class AgentTool(ToolCallingAgent):
    """ToolCallingAgent backed by an OpenAIServerModel built from AgentSettings.

    ``process`` runs the agent and validates the result with ``process_response``,
    re-running on failure.
    """

    def __init__(
        self,
        settings: AgentSettings,
        log: Logger,
        name: typing.Optional[str] = None,
        description: typing.Optional[str] = None,
        tools: typing.Optional[typing.List[Tool]] = None,
        verbosity: typing.Optional[int] = 0,
    ):
        """Build the model from ``settings`` and initialise the underlying agent.

        Args:
            settings: Supplies model_id, api_base, the API key, and max_steps.
            log: Logger used for retry messages.
            name: Passed through to ToolCallingAgent.
            description: Passed through to ToolCallingAgent.
            tools: Tools for the agent; ``None`` means no tools.
            verbosity: Passed to ToolCallingAgent as ``verbosity_level``.
        """
        if tools is None:
            tools = []

        model = OpenAIServerModel(
            model_id=settings.model_id,
            api_base=settings.api_base,
            api_key=settings.get_api_key(),
        )

        super().__init__(  # pyright: ignore[reportUnknownMemberType]
            name=name,
            description=description,
            tools=tools,
            model=model,
            max_steps=settings.max_steps,
            verbosity_level=verbosity,
        )

        self.log = log

    def process(
        self,
        conflict: str,
        images: typing.List[Image],
        expected_types: typing.Union[type, typing.Tuple[type, ...]] = dict,
        attempt: int = 0,
    ) -> typing.Any:
        """Run the agent on ``conflict`` and return a result of an expected type.

        Args:
            conflict: Prompt text; ``prompt_suffix`` is appended before running.
            images: Images passed to the agent run.
            expected_types: Type or tuple of types accepted by ``process_response``.
            attempt: Zero-based retry counter; callers normally leave it at 0.

        Returns:
            The value produced by ``process_response``.

        Raises:
            TypeError: When processing still fails once ``attempt`` exceeds 2. The
                original exception is chained as the cause.
        """
        res = super().run(  # pyright: ignore[reportUnknownMemberType]
            conflict + prompt_suffix,
            images=images,
        )

        try:
            return process_response(res=res, expected_types=expected_types)

        except Exception as e:
            if attempt > 2:
                # Chain the underlying failure so the real cause stays in the traceback.
                raise TypeError(
                    f"agent process result is not of expected type(s) {expected_types!r}: [{e}]\n\n{res}"
                ) from e

            # Route the retry notice through the injected logger (as AgentCode does)
            # instead of writing to stdout.
            self.log.debug_msg(
                f"agent process result is not of expected type(s) {expected_types!r}: [{e}], attempting again [{attempt+1}]\n\n{res}"
            )

            return self.process(conflict, images, expected_types, attempt + 1)
@@ -0,0 +1,24 @@
1
+ from .agent import AgentRequest
2
+ from .api import ProcessResponse
3
+ from .document import Document, DocumentRequest
4
+ from .field import ExtractedField
5
+ from .groundx import GroundXDocument, XRayDocument
6
+ from .prompt import Prompt
7
+ from .test_field import TestField
8
+ from .test_groundx import TestChunk, TestDocumentPage, TestXRay
9
+
10
+
11
# Public names of the classes subpackage; every entry is imported from a sibling module above.
__all__ = [
    "AgentRequest",
    "Document",
    "DocumentRequest",
    "ExtractedField",
    "GroundXDocument",
    "ProcessResponse",
    "Prompt",
    "TestChunk",
    "TestDocumentPage",
    "TestField",
    "TestXRay",
    "XRayDocument",
]
@@ -0,0 +1,23 @@
1
+ import typing
2
+ from pydantic import BaseModel, field_validator
3
+
4
+ from .document import Document, DocumentRequest
5
+
6
# Type parameters for AgentRequest: the request payload type and the document type,
# bounded by DocumentRequest and Document respectively.
ReqT = typing.TypeVar("ReqT", bound=DocumentRequest)
DocT = typing.TypeVar("DocT", bound=Document)
8
+
9
+
10
# Generic request envelope pairing a DocumentRequest-derived payload with a
# Document-derived statement. `allowed_request_types` is empty here, so a subclass has to
# populate it for any `request_type` value to pass validation.
class AgentRequest(BaseModel, typing.Generic[ReqT, DocT]):
    allowed_request_types: typing.ClassVar[typing.List[str]] = []
    request: ReqT
    request_type: str
    statement: DocT

    @field_validator("request_type")
    @classmethod
    def validate_request_type(cls, value: str):
        """Accept ``value`` only if it is listed in ``cls.allowed_request_types``.

        Raises:
            ValueError: If ``value`` is not one of the allowed request types.
        """
        permitted = cls.allowed_request_types
        if value in permitted:
            return value

        raise ValueError(
            f"Invalid request_type '{value}'. Must be one of {permitted}"
        )
@@ -0,0 +1,15 @@
1
+ from dataclasses import dataclass
2
+ from pydantic import BaseModel, ConfigDict, Field
3
+
4
+
5
# Error payload model. `populate_by_name=True` lets `document_id` / `task_id` be supplied
# either by field name or by their camelCase aliases.
class ErrorResponse(BaseModel):
    model_config = ConfigDict(populate_by_name=True)
    code: int
    document_id: str = Field(alias="documentID")  # aliased as "documentID"
    message: str
    task_id: str = Field(alias="taskID")  # aliased as "taskID"
11
+
12
+
13
# Plain dataclass carrying a single message string.
@dataclass
class ProcessResponse:
    message: str
@@ -0,0 +1,338 @@
1
+ import json, os, shutil, requests, time, typing
2
+ from datetime import datetime, timezone
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from PIL import Image
6
+ from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
7
+ from urllib.parse import urlparse
8
+
9
+ from .groundx import GroundXDocument
10
+ from ..services.logger import Logger
11
+ from ..services.upload import Upload
12
+ from ..utility.classes import clean_json
13
+
14
+
15
# Lets Document.from_request return an instance of whichever Document subclass it is called on.
DocT = typing.TypeVar("DocT", bound="Document")
16
+
17
+
18
# Base model for a document assembled from a GroundX X-Ray result.
# `from_request` fills the identifiers, page image URLs and source URL, then feeds every
# key/value found in each chunk's JSON texts to `add`. `add` and `finalize_init` are
# placeholders here that only log a warning; subclasses are expected to override them.
class Document(BaseModel):
    file_name: str = ""

    document_id: str = ""
    page_images: typing.List[str] = []
    source_url: str = ""
    task_id: str = ""

    _logger: typing.Optional[Logger] = PrivateAttr(default=None)

    @property
    def logger(self) -> typing.Optional[Logger]:
        """Return the attached logger, or None when none is set."""
        if self._logger:
            return self._logger

        return None

    @logger.setter
    def logger(self, value: Logger) -> None:
        self._logger = value

    @logger.deleter
    def logger(self) -> None:
        del self._logger

    @classmethod
    def from_request(
        cls: typing.Type[DocT],
        base_url: str,
        cache_dir: Path,
        req: "DocumentRequest",
        upload: typing.Optional[Upload] = None,
        **data: typing.Any,
    ) -> DocT:
        """Build an instance from a request and the document's X-Ray output.

        Args:
            base_url: Passed to GroundXDocument.
            cache_dir: Passed to ``GroundXDocument.xray``.
            req: Supplies document_id, file_name, task_id and clear_cache.
            upload: Optional Upload passed to ``GroundXDocument.xray``.
            **data: Keyword arguments forwarded to the class constructor.

        Returns:
            The populated instance, after ``finalize_init`` has run.

        Raises:
            Exception: If ``add`` returns an error string for any key/value pair.
        """
        st = cls(**data)

        st.document_id = req.document_id
        st.file_name = req.file_name
        st.task_id = req.task_id

        xray_doc = GroundXDocument(
            base_url=base_url,
            documentID=req.document_id,
            taskID=req.task_id,
        ).xray(upload=upload, cache_dir=cache_dir, clear_cache=req.clear_cache)

        for page in xray_doc.documentPages:
            st.page_images.append(page.pageUrl)

        st.source_url = xray_doc.sourceUrl

        for chunk in xray_doc.chunks:
            # The two texts are handled independently, so a malformed sectionSummary does
            # not cause the same chunk's suggestedText to be skipped.
            for label, raw in (
                ("stxt", chunk.sectionSummary),
                ("mtxt", chunk.suggestedText),
            ):
                text = clean_json(raw or "{}")
                try:
                    parsed = json.loads(text)
                except json.JSONDecodeError:
                    st.print("ERROR", f"\njson.JSONDecodeError {label}\n{text}\n\n")
                    continue

                # Only JSON objects carry key/value pairs; anything else has no .items().
                if not isinstance(parsed, dict):
                    st.print(
                        "ERROR",
                        f"\nexpected a JSON object in {label}, got {type(parsed).__name__}\n{text}\n\n",
                    )
                    continue

                for key, value in parsed.items():
                    err = st.add(key, value)
                    if err:
                        raise Exception(f"\n\ninit document error:\n\t{err}\n")

        st.finalize_init()

        return st

    def add(self, k: str, value: typing.Any) -> typing.Union[str, None]:
        """Placeholder hook: logs a warning and returns None (no error)."""
        self.print("WARNING", "add is not implemented")

        return None

    def finalize_init(self) -> None:
        """Placeholder hook called at the end of ``from_request``; logs a warning."""
        self.print("WARNING", "finalize_init is not implemented")

    def print(self, level: str, msg: str) -> None:
        """Emit ``msg`` via the logger at ``level``, or via builtin print without a logger.

        ``level`` is case-insensitive; ERROR, INFO and WARN/WARNING map to the matching
        logger methods and any other value is logged at debug level.
        """
        if not self.logger:
            print(msg)
            return

        lvl = level.upper()
        if lvl == "ERROR":
            self.logger.error_msg(msg, self.file_name, self.document_id, self.task_id)
        elif lvl == "INFO":
            self.logger.info_msg(msg, self.file_name, self.document_id, self.task_id)
        elif lvl in ("WARN", "WARNING"):
            self.logger.warning_msg(msg, self.file_name, self.document_id, self.task_id)
        else:
            self.logger.debug_msg(msg, self.file_name, self.document_id, self.task_id)
122
+
123
+
124
+ def _new_page_image_dict() -> typing.Dict[str, int]:
125
+ return {}
126
+
127
+
128
def _new_page_images() -> typing.List[Image.Image]:
    """Create a new empty list; used as the default factory for ``_page_images``."""
    return list()
130
+
131
+
132
# Request model identifying one document/task, populated by field name or camelCase alias.
# Besides the validated fields it carries private per-request state: an optional logger,
# flags, an optional debug directory, a cache of loaded page images keyed by URL, the
# creation timestamp (UTC epoch seconds), and an optional write lock.
class DocumentRequest(BaseModel):
    model_config = ConfigDict(populate_by_name=True)
    callback_url: str = Field(alias="callbackURL", default="")
    document_id: str = Field(alias="documentID")
    file_name: str = Field(alias="fileName")
    model_id: int = Field(alias="modelID")
    processor_id: int = Field(alias="processorID")
    task_id: str = Field(alias="taskID")

    _logger: typing.Optional[Logger] = PrivateAttr(default=None)

    _append_values: bool = PrivateAttr(default_factory=bool)
    _clear_cache: bool = PrivateAttr(default_factory=bool)
    _debug_path: typing.Optional[str] = PrivateAttr(default=None)
    # Maps a page URL to the index of its image inside _page_images.
    _page_image_dict: typing.Dict[str, int] = PrivateAttr(
        default_factory=_new_page_image_dict
    )
    _page_images: typing.List[Image.Image] = PrivateAttr(
        default_factory=_new_page_images
    )
    _start: int = PrivateAttr(
        default_factory=lambda: int(datetime.now(timezone.utc).timestamp())
    )
    _write_lock: typing.Optional[typing.Any] = PrivateAttr(default=None)

    @property
    def append_values(self) -> bool:
        return self._append_values

    @append_values.setter
    def append_values(self, value: bool) -> None:
        self._append_values = value

    @append_values.deleter
    def append_values(self) -> None:
        del self._append_values

    @property
    def clear_cache(self) -> bool:
        return self._clear_cache

    @clear_cache.setter
    def clear_cache(self, value: bool) -> None:
        self._clear_cache = value

    @clear_cache.deleter
    def clear_cache(self) -> None:
        del self._clear_cache

    @property
    def debug_path(self) -> typing.Optional[str]:
        return self._debug_path

    @debug_path.setter
    def debug_path(self, value: str) -> None:
        self._debug_path = value

    @debug_path.deleter
    def debug_path(self) -> None:
        del self._debug_path

    @property
    def logger(self) -> typing.Optional[Logger]:
        """Return the attached logger, or None when none is set."""
        if self._logger:
            return self._logger

        return None

    @logger.setter
    def logger(self, value: Logger) -> None:
        self._logger = value

    @logger.deleter
    def logger(self) -> None:
        del self._logger

    @property
    def page_images(self) -> typing.List[Image.Image]:
        return self._page_images

    @page_images.setter
    def page_images(self, value: typing.List[Image.Image]) -> None:
        self._page_images = value

    @page_images.deleter
    def page_images(self) -> None:
        del self._page_images

    @property
    def page_image_dict(self) -> typing.Dict[str, int]:
        return self._page_image_dict

    @page_image_dict.setter
    def page_image_dict(self, value: typing.Dict[str, int]) -> None:
        self._page_image_dict = value

    @page_image_dict.deleter
    def page_image_dict(self) -> None:
        del self._page_image_dict

    @property
    def start(self) -> int:
        """Creation time of this request as UTC epoch seconds (read-only)."""
        return self._start

    @property
    def write_lock(self) -> typing.Optional[typing.Any]:
        return self._write_lock

    @write_lock.setter
    def write_lock(self, value: typing.Optional[typing.Any]) -> None:
        self._write_lock = value

    @write_lock.deleter
    def write_lock(self) -> None:
        del self._write_lock

    def clear_debug(self) -> None:
        """Remove this file's debug directory under ``debug_path``, ignoring errors."""
        if self.debug_path:
            file_path = f"{self.debug_path}/{self.file_name.replace('.pdf','')}"
            shutil.rmtree(file_path, ignore_errors=True)

    def load_images(
        self,
        imgs: typing.List[str],
        upload: typing.Optional[Upload] = None,
        attempt: int = 0,
        should_sleep: bool = True,
        timeout: typing.Optional[float] = 30.0,
    ) -> typing.List[Image.Image]:
        """Load the page images for ``imgs``, reusing images cached on this request.

        For each URL, in order: use the cached image if the URL was loaded before;
        otherwise, when ``upload`` is given, try ``upload.get_object`` with the URL's path
        (plus query string); otherwise download the URL over HTTP. A failed download
        restarts the whole load with ``attempt + 1`` while ``attempt < 2``; after that the
        failing URL is logged and left out of the result.

        Args:
            imgs: Page image URLs.
            upload: Optional Upload used to fetch objects before falling back to HTTP.
            attempt: Zero-based retry counter; callers normally leave it at 0.
            should_sleep: Whether to sleep ``2 * attempt + 1`` seconds before a retry.
            timeout: Seconds passed to ``requests.get`` so a stalled server cannot block
                the call indefinitely; ``None`` disables the timeout.

        Returns:
            The loaded images, in the order of ``imgs`` minus any URL that failed.
        """
        pageImages: typing.List[Image.Image] = []
        for page in imgs:
            if page in self.page_image_dict:
                self.print(
                    "WARN",
                    f"[{attempt}] loading cached [{self.page_image_dict[page]}] [{page}]",
                )
                pageImages.append(self.page_images[self.page_image_dict[page]])
                continue

            if upload:
                parsed = urlparse(page)
                path = parsed.path + ("?" + parsed.query if parsed.query else "")
                ru = upload.get_object(path)
                if ru:
                    img = Image.open(BytesIO(ru))
                    if img:
                        self.page_image_dict[page] = len(self.page_images)
                        self.page_images.append(img)
                        pageImages.append(img)
                        continue

            try:
                self.print("WARN", f"[{attempt}] downloading [{page}]")
                # A timeout turns a hung connection into an exception handled by the
                # retry logic below.
                resp = requests.get(page, timeout=timeout)
                resp.raise_for_status()
                img = Image.open(BytesIO(resp.content))
                if img:
                    self.page_image_dict[page] = len(self.page_images)
                    self.page_images.append(img)
                    pageImages.append(img)
            except Exception as e:
                self.print(
                    "ERROR", f"[{attempt}] Failed to load image from {page}: {e}"
                )
                if attempt < 2:
                    if should_sleep:
                        time.sleep(2 * attempt + 1)
                    # Restart from the first URL; images loaded so far come from the cache.
                    return self.load_images(
                        imgs,
                        upload,
                        attempt + 1,
                        should_sleep=should_sleep,
                        timeout=timeout,
                    )

        return pageImages

    def print(self, level: str, msg: str) -> None:
        """Emit ``msg`` via the logger at ``level``, or via builtin print without a logger.

        ``level`` is case-insensitive; ERROR, INFO and WARN/WARNING map to the matching
        logger methods and any other value is logged at debug level.
        """
        if not self.logger:
            print(msg)
            return

        lvl = level.upper()
        if lvl == "ERROR":
            self.logger.error_msg(msg, self.file_name, self.document_id, self.task_id)
        elif lvl == "INFO":
            self.logger.info_msg(msg, self.file_name, self.document_id, self.task_id)
        elif lvl in ("WARN", "WARNING"):
            self.logger.warning_msg(msg, self.file_name, self.document_id, self.task_id)
        else:
            self.logger.debug_msg(msg, self.file_name, self.document_id, self.task_id)

    def write_debug(self, file_name: str, data: typing.Any) -> None:
        """Write ``data`` to ``<debug_path>/<file name without '.pdf'>/<start>_<file_name>``.

        Does nothing when ``debug_path`` is unset. Non-string data is serialised with
        ``json.dumps``; if that fails, an Exception instance is written as ``str(data)``.

        Raises:
            Exception: Re-raises the ``json.dumps`` error when ``data`` is neither
                JSON-serialisable nor an Exception.
        """
        if not self.debug_path:
            return

        os.makedirs(self.debug_path, exist_ok=True)
        file_path = f"{self.debug_path}/{self.file_name.replace('.pdf','')}"
        os.makedirs(file_path, exist_ok=True)

        if not isinstance(data, str):
            try:
                data = json.dumps(data)
            except Exception as e:
                if isinstance(data, Exception):
                    data = str(data)
                else:
                    self.print("ERROR", f"write_debug exception: {e}")
                    raise e

        with open(f"{file_path}/{self.start}_{file_name}", "w", encoding="utf-8") as f:
            f.write(data)