groundx 2.4.4__py3-none-any.whl → 2.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of groundx might be problematic. Click here for more details.
- groundx/core/client_wrapper.py +2 -2
- groundx/extract/__init__.py +38 -0
- groundx/extract/agents/__init__.py +7 -0
- groundx/extract/agents/agent.py +202 -0
- groundx/extract/classes/__init__.py +27 -0
- groundx/extract/classes/agent.py +22 -0
- groundx/extract/classes/api.py +15 -0
- groundx/extract/classes/document.py +311 -0
- groundx/extract/classes/field.py +88 -0
- groundx/extract/classes/groundx.py +123 -0
- groundx/extract/classes/post_process.py +33 -0
- groundx/extract/classes/prompt.py +36 -0
- groundx/extract/classes/settings.py +169 -0
- groundx/extract/classes/test_document.py +126 -0
- groundx/extract/classes/test_field.py +43 -0
- groundx/extract/classes/test_groundx.py +188 -0
- groundx/extract/classes/test_prompt.py +68 -0
- groundx/extract/classes/test_settings.py +515 -0
- groundx/extract/classes/test_utility.py +81 -0
- groundx/extract/classes/utility.py +193 -0
- groundx/extract/services/.DS_Store +0 -0
- groundx/extract/services/__init__.py +14 -0
- groundx/extract/services/csv.py +76 -0
- groundx/extract/services/logger.py +127 -0
- groundx/extract/services/logging_cfg.py +55 -0
- groundx/extract/services/ratelimit.py +104 -0
- groundx/extract/services/sheets_client.py +160 -0
- groundx/extract/services/status.py +197 -0
- groundx/extract/services/upload.py +73 -0
- groundx/extract/services/upload_minio.py +122 -0
- groundx/extract/services/upload_s3.py +84 -0
- groundx/extract/services/utility.py +52 -0
- {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/METADATA +1 -1
- {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/RECORD +36 -5
- {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/LICENSE +0 -0
- {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/WHEEL +0 -0
groundx/core/client_wrapper.py
CHANGED
|
@@ -14,10 +14,10 @@ class BaseClientWrapper:
|
|
|
14
14
|
|
|
15
15
|
def get_headers(self) -> typing.Dict[str, str]:
|
|
16
16
|
headers: typing.Dict[str, str] = {
|
|
17
|
-
"User-Agent": "groundx/2.4.
|
|
17
|
+
"User-Agent": "groundx/2.4.9",
|
|
18
18
|
"X-Fern-Language": "Python",
|
|
19
19
|
"X-Fern-SDK-Name": "groundx",
|
|
20
|
-
"X-Fern-SDK-Version": "2.4.
|
|
20
|
+
"X-Fern-SDK-Version": "2.4.9",
|
|
21
21
|
}
|
|
22
22
|
headers["X-API-Key"] = self.api_key
|
|
23
23
|
return headers
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from .agents import AgentCode, AgentTool
|
|
2
|
+
from .classes import (
|
|
3
|
+
AgentRequest,
|
|
4
|
+
AgentSettings,
|
|
5
|
+
ContainerSettings,
|
|
6
|
+
ContainerUploadSettings,
|
|
7
|
+
Document,
|
|
8
|
+
DocumentRequest,
|
|
9
|
+
ExtractedField,
|
|
10
|
+
GroundXDocument,
|
|
11
|
+
GroundXSettings,
|
|
12
|
+
ProcessResponse,
|
|
13
|
+
Prompt,
|
|
14
|
+
XRayDocument,
|
|
15
|
+
)
|
|
16
|
+
from .services import Logger, RateLimit, SheetsClient, Status, Upload
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"AgentCode",
|
|
20
|
+
"AgentRequest",
|
|
21
|
+
"AgentSettings",
|
|
22
|
+
"AgentTool",
|
|
23
|
+
"ContainerSettings",
|
|
24
|
+
"ContainerUploadSettings",
|
|
25
|
+
"Document",
|
|
26
|
+
"DocumentRequest",
|
|
27
|
+
"ExtractedField",
|
|
28
|
+
"GroundXDocument",
|
|
29
|
+
"GroundXSettings",
|
|
30
|
+
"Logger",
|
|
31
|
+
"ProcessResponse",
|
|
32
|
+
"Prompt",
|
|
33
|
+
"RateLimit",
|
|
34
|
+
"SheetsClient",
|
|
35
|
+
"Status",
|
|
36
|
+
"Upload",
|
|
37
|
+
"XRayDocument",
|
|
38
|
+
]
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
import json, pytest, traceback, typing
|
|
2
|
+
|
|
3
|
+
pytest.importorskip("PIL")
|
|
4
|
+
|
|
5
|
+
from PIL.Image import Image
|
|
6
|
+
|
|
7
|
+
from smolagents import ( # pyright: ignore[reportMissingTypeStubs]
|
|
8
|
+
CodeAgent,
|
|
9
|
+
Tool,
|
|
10
|
+
ToolCallingAgent,
|
|
11
|
+
)
|
|
12
|
+
from smolagents.models import ( # pyright: ignore[reportMissingTypeStubs]
|
|
13
|
+
OpenAIServerModel,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
from ..classes.settings import AgentSettings
|
|
17
|
+
from ..classes.utility import clean_json
|
|
18
|
+
from ..services.logger import Logger
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
prompt_suffix = """
|
|
22
|
+
Return only your response using the `final_answer` tool format:
|
|
23
|
+
|
|
24
|
+
```json
|
|
25
|
+
{{"answer": {{"type": RESPONSE_HERE, "description": "The final answer to the problem"}}}}
|
|
26
|
+
```
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def extract_response(res: typing.Dict[str, typing.Any]) -> typing.Any:
    """Unwrap the payload from a `final_answer`-style agent response.

    The expected shape is ``{"answer": {"type": <payload>, ...}}``; a reply
    that drops the outer ``"answer"`` wrapper (``{"type": <payload>}``) is
    accepted as well.

    Args:
        res: Parsed agent response. Normally a dict, but callers may hand in
            an arbitrary list element, so non-dict values are tolerated.

    Returns:
        ``res["answer"]["type"]`` when ``res["answer"]`` is a dict holding a
        ``"type"`` key, else ``res["type"]`` when present, else ``res``
        unchanged.
    """
    # Key lookups are only meaningful on dicts: on a string ``"type" in res``
    # is a substring test, and on most other objects it raises TypeError.
    if not isinstance(res, dict):
        return res

    answer = res.get("answer")
    # Guard against a scalar/string/list "answer" (e.g. {"answer": "prototype"}),
    # which would otherwise pass the membership test and then fail on indexing.
    if isinstance(answer, dict) and "type" in answer:
        return answer["type"]

    if "type" in res:
        return res["type"]

    return res
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def process_response(
    res: typing.Any,
    expected_types: typing.Union[type, typing.Tuple[type, ...]] = dict,
) -> typing.Any:
    """Coerce a raw agent result into one of ``expected_types``.

    A result that already has an expected type is returned as-is, except that
    dicts are first passed through ``extract_response``. Otherwise a
    one-element list is unwrapped (only when ``dict`` is among the expected
    types), and a string is cleaned with ``clean_json`` and parsed as JSON,
    after which the same rules are applied to the parsed value.

    Args:
        res: The value returned by the agent run.
        expected_types: A type or tuple of types accepted as the result.

    Returns:
        The (possibly unwrapped or parsed) result.

    Raises:
        TypeError: If the result, or the JSON parsed from it, is not of an
            expected type and cannot be unwrapped.
        json.JSONDecodeError: If a string result is not valid JSON after
            cleaning.
    """
    if isinstance(res, expected_types):
        return extract_response(res) if isinstance(res, dict) else res

    dict_is_expected = isinstance({}, expected_types)

    def wraps_single_item(candidate: typing.Any) -> bool:
        # A lone list element stands in for the dict the caller asked for.
        return (
            isinstance(candidate, list) and dict_is_expected and len(candidate) == 1
        )

    if wraps_single_item(res):
        return extract_response(res[0])

    if not isinstance(res, str):
        traceback.print_stack()
        raise TypeError(
            f"agent process result is not of expected type(s) {expected_types!r}, got {type(res)!r}"
        )

    loaded = json.loads(clean_json(res))

    if isinstance(loaded, expected_types):
        return extract_response(loaded) if isinstance(loaded, dict) else loaded

    if wraps_single_item(loaded):
        return extract_response(loaded[0])

    traceback.print_stack()
    raise TypeError(
        f"agent process result is not of expected type(s) {expected_types!r} after JSON parsing, got {type(loaded)!r}"
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class AgentCode(CodeAgent):
    """`CodeAgent` configured from `AgentSettings`, with a typed `process` helper."""

    def __init__(
        self,
        settings: AgentSettings,
        logger: Logger,
        name: typing.Optional[str] = None,
        description: typing.Optional[str] = None,
        tools: typing.Optional[typing.List[Tool]] = None,
        verbosity: typing.Optional[int] = 0,
    ):
        """Build the model from ``settings`` and initialise the code agent.

        Args:
            settings: Supplies the model id, API base, API key, the extra
                authorized imports and the step limit.
            logger: Logger used for retry diagnostics in `process`.
            name: Optional agent name forwarded to the base class.
            description: Optional agent description forwarded to the base class.
            tools: Tools made available to the agent; defaults to none.
            verbosity: Forwarded to the base class as ``verbosity_level``.
        """
        if tools is None:
            tools = []

        model = OpenAIServerModel(
            model_id=settings.model_id,
            api_base=settings.api_base,
            api_key=settings.get_api_key(),
        )

        super().__init__(  # pyright: ignore[reportUnknownMemberType]
            name=name,
            description=description,
            additional_authorized_imports=settings.imports,
            tools=tools,
            model=model,
            max_steps=settings.max_steps,
            verbosity_level=verbosity,
        )

        if self.python_executor.static_tools is None:  # type: ignore
            self.python_executor.static_tools = {}  # type: ignore

        # Registers the builtin `open` under the name "open" in the executor's
        # static tools. NOTE(review): presumably this lets agent-generated code
        # open files; confirm that level of filesystem access is intended.
        self.python_executor.static_tools.update({"open": open})  # type: ignore

        self.logger = logger

    def process(
        self,
        conflict: str,
        images: typing.List[Image],
        expected_types: typing.Union[type, typing.Tuple[type, ...]] = dict,
        attempt: int = 0,
    ) -> typing.Any:
        """Run the agent on ``conflict`` and return a result of an expected type.

        ``prompt_suffix`` is appended to the prompt before the run. If the
        result cannot be coerced by ``process_response`` the whole run is
        repeated, for at most four runs in total (``attempt`` 0 through 3).

        Args:
            conflict: Prompt text for the agent.
            images: Images passed along with the prompt.
            expected_types: Type or tuple of types accepted as the result.
            attempt: Zero-based retry counter; callers normally leave it at 0.

        Returns:
            The coerced agent result.

        Raises:
            TypeError: If the result still cannot be coerced on the final
                attempt; the underlying error is attached as the cause.
        """
        res = super().run(  # pyright: ignore[reportUnknownMemberType]
            conflict + prompt_suffix,
            images=images,
        )

        try:
            return process_response(res=res, expected_types=expected_types)

        except Exception as e:
            if attempt > 2:
                # Chain explicitly so the traceback shows the parsing failure
                # as the cause rather than as an unrelated secondary error.
                raise TypeError(
                    f"agent process result is not of expected type(s) {expected_types!r}: [{e}]\n\n{res}"
                ) from e

            self.logger.debug_msg(
                f"agent process result is not of expected type(s) {expected_types!r}: [{e}], attempting again [{attempt+1}]\n\n{res}"
            )

            return self.process(conflict, images, expected_types, attempt + 1)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class AgentTool(ToolCallingAgent):
    """`ToolCallingAgent` configured from `AgentSettings`, with a typed `process` helper."""

    def __init__(
        self,
        settings: AgentSettings,
        logger: Logger,
        name: typing.Optional[str] = None,
        description: typing.Optional[str] = None,
        tools: typing.Optional[typing.List[Tool]] = None,
        verbosity: typing.Optional[int] = 0,
    ):
        """Build the model from ``settings`` and initialise the tool-calling agent.

        Args:
            settings: Supplies the model id, API base, API key and step limit.
            logger: Logger used for retry diagnostics in `process`.
            name: Optional agent name forwarded to the base class.
            description: Optional agent description forwarded to the base class.
            tools: Tools made available to the agent; defaults to none.
            verbosity: Forwarded to the base class as ``verbosity_level``.
        """
        if tools is None:
            tools = []

        model = OpenAIServerModel(
            model_id=settings.model_id,
            api_base=settings.api_base,
            api_key=settings.get_api_key(),
        )

        super().__init__(  # pyright: ignore[reportUnknownMemberType]
            name=name,
            description=description,
            tools=tools,
            model=model,
            max_steps=settings.max_steps,
            verbosity_level=verbosity,
        )

        self.logger = logger

    def process(
        self,
        conflict: str,
        images: typing.List[Image],
        expected_types: typing.Union[type, typing.Tuple[type, ...]] = dict,
        attempt: int = 0,
    ) -> typing.Any:
        """Run the agent on ``conflict`` and return a result of an expected type.

        ``prompt_suffix`` is appended to the prompt before the run. If the
        result cannot be coerced by ``process_response`` the whole run is
        repeated, for at most four runs in total (``attempt`` 0 through 3).

        Args:
            conflict: Prompt text for the agent.
            images: Images passed along with the prompt.
            expected_types: Type or tuple of types accepted as the result.
            attempt: Zero-based retry counter; callers normally leave it at 0.

        Returns:
            The coerced agent result.

        Raises:
            TypeError: If the result still cannot be coerced on the final
                attempt; the underlying error is attached as the cause.
        """
        res = super().run(  # pyright: ignore[reportUnknownMemberType]
            conflict + prompt_suffix,
            images=images,
        )

        try:
            return process_response(res=res, expected_types=expected_types)

        except Exception as e:
            if attempt > 2:
                # Chain explicitly so the traceback shows the parsing failure
                # as the cause rather than as an unrelated secondary error.
                raise TypeError(
                    f"agent process result is not of expected type(s) {expected_types!r}: [{e}]\n\n{res}"
                ) from e

            # Route retry diagnostics through the injected logger, as
            # AgentCode.process does, instead of writing to stdout.
            self.logger.debug_msg(
                f"agent process result is not of expected type(s) {expected_types!r}: [{e}], attempting again [{attempt+1}]\n\n{res}"
            )

            return self.process(conflict, images, expected_types, attempt + 1)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from .agent import AgentRequest
|
|
2
|
+
from .api import ProcessResponse
|
|
3
|
+
from .document import Document, DocumentRequest
|
|
4
|
+
from .field import ExtractedField
|
|
5
|
+
from .groundx import GroundXDocument, XRayDocument
|
|
6
|
+
from .prompt import Prompt
|
|
7
|
+
from .settings import (
|
|
8
|
+
AgentSettings,
|
|
9
|
+
ContainerSettings,
|
|
10
|
+
ContainerUploadSettings,
|
|
11
|
+
GroundXSettings,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"AgentRequest",
|
|
16
|
+
"AgentSettings",
|
|
17
|
+
"ContainerSettings",
|
|
18
|
+
"ContainerUploadSettings",
|
|
19
|
+
"Document",
|
|
20
|
+
"DocumentRequest",
|
|
21
|
+
"ExtractedField",
|
|
22
|
+
"GroundXDocument",
|
|
23
|
+
"GroundXSettings",
|
|
24
|
+
"ProcessResponse",
|
|
25
|
+
"Prompt",
|
|
26
|
+
"XRayDocument",
|
|
27
|
+
]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
from pydantic import BaseModel, field_validator
|
|
3
|
+
|
|
4
|
+
from .document import Document, DocumentRequest
|
|
5
|
+
|
|
6
|
+
ReqT = typing.TypeVar("ReqT", bound=DocumentRequest)
|
|
7
|
+
DocT = typing.TypeVar("DocT", bound=Document)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AgentRequest(BaseModel, typing.Generic[ReqT, DocT]):
    """Generic request envelope pairing a document request with its document.

    ``request_type`` is validated against ``allowed_request_types``.
    """

    # Request types accepted by `validate_request_type`; empty on this base class.
    allowed_request_types: typing.List[str] = []
    request: ReqT
    request_type: str
    statement: DocT

    @field_validator("request_type")
    @classmethod
    def validate_request_type(cls, value: str) -> str:
        """Reject a ``request_type`` that is not listed in ``allowed_request_types``.

        Args:
            value: The incoming ``request_type`` value.

        Returns:
            ``value`` unchanged when it is allowed.

        Raises:
            ValueError: If ``value`` is not in ``cls.allowed_request_types``.
        """
        # NOTE(review): this reads the list from the class, not from the
        # instance being validated; confirm subclasses expose
        # `allowed_request_types` as a class-level attribute.
        if value not in cls.allowed_request_types:
            raise ValueError(
                f"Invalid request_type '{value}'. Must be one of {cls.allowed_request_types}"
            )
        return value
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ErrorResponse(BaseModel):
    """Error payload: a numeric code, a message and the related document/task IDs."""

    # Accept both the snake_case field names and the camelCase aliases on input.
    model_config = ConfigDict(populate_by_name=True)
    code: int
    document_id: str = Field(alias="documentID")
    message: str
    task_id: str = Field(alias="taskID")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class ProcessResponse:
    """Response object carrying a single message string."""

    message: str
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
import json, os, shutil, requests, time, typing
|
|
2
|
+
from datetime import datetime, timezone
|
|
3
|
+
from io import BytesIO
|
|
4
|
+
from PIL import Image
|
|
5
|
+
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
|
|
6
|
+
|
|
7
|
+
from .groundx import GroundXDocument
|
|
8
|
+
from ..services.logger import Logger
|
|
9
|
+
from .utility import clean_json
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
DocT = typing.TypeVar("DocT", bound="Document")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Document(BaseModel):
    """Base model for a document populated from a GroundX X-Ray result.

    `from_request` fetches the X-Ray data, records page image URLs and the
    source URL, and feeds every key/value pair found in the chunks' JSON
    blobs to `add`. The base `add` and `finalize_init` only log a warning;
    subclasses are expected to override them.
    """

    file_name: str = ""

    document_id: str = ""
    page_images: typing.List[str] = []
    source_url: str = ""
    task_id: str = ""

    _logger: typing.Optional[Logger] = PrivateAttr(default=None)

    @property
    def logger(self) -> typing.Optional[Logger]:
        """Return the attached logger, or None when unset (or falsy)."""
        return self._logger if self._logger else None

    @logger.setter
    def logger(self, value: Logger) -> None:
        self._logger = value

    @logger.deleter
    def logger(self) -> None:
        del self._logger

    @classmethod
    def from_request(
        cls: typing.Type[DocT],
        base_url: str,
        req: "DocumentRequest",
        **data: typing.Any,
    ) -> DocT:
        """Build a document from ``req`` using its X-Ray data.

        Args:
            base_url: Base URL handed to `GroundXDocument`.
            req: Request supplying the document ID, file name, task ID and
                the ``clear_cache`` flag.
            **data: Keyword arguments forwarded to the model constructor.

        Returns:
            The populated document, after `finalize_init` has run.

        Raises:
            Exception: If `add` reports an error for any key/value pair.
        """
        doc = cls(**data)

        doc.document_id = req.document_id
        doc.file_name = req.file_name
        doc.task_id = req.task_id

        xray = GroundXDocument(
            base_url=base_url,
            documentID=req.document_id,
            taskID=req.task_id,
        ).xray(clear_cache=req.clear_cache)

        doc.page_images.extend(page.pageUrl for page in xray.documentPages)

        doc.source_url = xray.sourceUrl

        # Each chunk carries two JSON blobs, processed in this order; the
        # label is what appears in the decode-error message.
        blob_sources = (("stxt", "sectionSummary"), ("mtxt", "suggestedText"))

        for chunk in xray.chunks:
            for label, attr_name in blob_sources:
                text = clean_json(getattr(chunk, attr_name) or "{}")
                try:
                    pairs = json.loads(text)
                except json.JSONDecodeError:
                    doc.print("ERROR", f"\njson.JSONDecodeError {label}\n{text}\n\n")
                    # A malformed blob abandons the rest of this chunk, so a
                    # bad sectionSummary also skips the chunk's suggestedText.
                    break

                for key, value in pairs.items():
                    err = doc.add(key, value)
                    if err:
                        raise Exception(f"\n\ninit document error:\n\t{err}\n")

        doc.finalize_init()

        return doc

    def add(self, k: str, value: typing.Any) -> typing.Union[str, None]:
        """Placeholder hook for one extracted key/value pair; logs a warning.

        Returns:
            None here; `from_request` treats a truthy return as an error message.
        """
        self.print("WARNING", "add is not implemented")

        return None

    def finalize_init(self) -> None:
        """Placeholder hook called at the end of `from_request`; logs a warning."""
        self.print("WARNING", "finalize_init is not implemented")

    def print(self, level: str, msg: str) -> None:
        """Emit ``msg`` through the logger at ``level``, or to stdout without one.

        ``level`` is case-insensitive; anything other than ERROR, INFO, WARN
        or WARNING is logged at debug level.
        """
        log = self.logger
        if not log:
            print(msg)
            return

        method_by_level = {
            "ERROR": "error_msg",
            "INFO": "info_msg",
            "WARN": "warning_msg",
            "WARNING": "warning_msg",
        }
        emit = getattr(log, method_by_level.get(level.upper(), "debug_msg"))
        emit(msg, self.file_name, self.document_id, self.task_id)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class DocumentRequest(BaseModel):
    """Incoming request describing a document to process.

    The public fields are populated from camelCase aliases (or their field
    names). Per-request working state (logger, flags, debug path, page-image
    cache, start timestamp, write lock) is kept in private attributes and
    exposed through properties.
    """

    model_config = ConfigDict(populate_by_name=True)
    callback_url: str = Field(alias="callbackURL", default="")
    document_id: str = Field(alias="documentID")
    file_name: str = Field(alias="fileName")
    model_id: int = Field(alias="modelID")
    processor_id: int = Field(alias="processorID")
    task_id: str = Field(alias="taskID")

    _logger: typing.Optional[Logger] = PrivateAttr(default=None)

    _append_values: bool = PrivateAttr(default_factory=bool)
    _clear_cache: bool = PrivateAttr(default_factory=bool)
    _debug_path: typing.Optional[str] = PrivateAttr(default=None)
    # The factories are the builtin `dict`/`list`: subscripted typing aliases
    # such as typing.Dict[str, int] raise TypeError when called.
    # Maps an image URL to its index in `_page_images`.
    _page_image_dict: typing.Dict[str, int] = PrivateAttr(default_factory=dict)
    # Images downloaded so far, in download order.
    _page_images: typing.List[Image.Image] = PrivateAttr(default_factory=list)
    # Creation time as whole seconds since the epoch (UTC); prefixes debug file names.
    _start: int = PrivateAttr(
        default_factory=lambda: int(datetime.now(timezone.utc).timestamp())
    )
    _write_lock: typing.Optional[typing.Any] = PrivateAttr(default=None)

    @property
    def append_values(self) -> bool:
        return self._append_values

    @append_values.setter
    def append_values(self, value: bool) -> None:
        self._append_values = value

    @append_values.deleter
    def append_values(self) -> None:
        del self._append_values

    @property
    def clear_cache(self) -> bool:
        return self._clear_cache

    @clear_cache.setter
    def clear_cache(self, value: bool) -> None:
        self._clear_cache = value

    @clear_cache.deleter
    def clear_cache(self) -> None:
        del self._clear_cache

    @property
    def debug_path(self) -> typing.Optional[str]:
        return self._debug_path

    @debug_path.setter
    def debug_path(self, value: str) -> None:
        self._debug_path = value

    @debug_path.deleter
    def debug_path(self) -> None:
        del self._debug_path

    @property
    def logger(self) -> typing.Optional[Logger]:
        """Return the attached logger, or None when unset (or falsy)."""
        if self._logger:
            return self._logger

        return None

    @logger.setter
    def logger(self, value: Logger) -> None:
        self._logger = value

    @logger.deleter
    def logger(self) -> None:
        del self._logger

    @property
    def page_images(self) -> typing.List[Image.Image]:
        return self._page_images

    @page_images.setter
    def page_images(self, value: typing.List[Image.Image]) -> None:
        self._page_images = value

    @page_images.deleter
    def page_images(self) -> None:
        del self._page_images

    @property
    def page_image_dict(self) -> typing.Dict[str, int]:
        return self._page_image_dict

    @page_image_dict.setter
    def page_image_dict(self, value: typing.Dict[str, int]) -> None:
        self._page_image_dict = value

    @page_image_dict.deleter
    def page_image_dict(self) -> None:
        del self._page_image_dict

    @property
    def start(self) -> int:
        return self._start

    @property
    def write_lock(self) -> typing.Optional[typing.Any]:
        return self._write_lock

    @write_lock.setter
    def write_lock(self, value: typing.Optional[typing.Any]) -> None:
        self._write_lock = value

    @write_lock.deleter
    def write_lock(self) -> None:
        del self._write_lock

    def clear_debug(self) -> None:
        """Delete this document's debug directory, if a debug path is set."""
        if self.debug_path:
            file_path = f"{self.debug_path}/{self.file_name.replace('.pdf','')}"
            shutil.rmtree(file_path, ignore_errors=True)

    def load_images(
        self,
        imgs: typing.List[str],
        attempt: int = 0,
        should_sleep: bool = True,
    ) -> typing.List[Image.Image]:
        """Return the images for ``imgs``, downloading any not yet cached.

        Downloaded images are cached on the request keyed by URL. When a
        download fails and ``attempt`` is below 2, the whole list is retried
        (already cached images are reused); on the last attempt the failing
        URL is logged and left out of the result.

        Args:
            imgs: Image URLs to load.
            attempt: Zero-based retry counter; callers normally leave it at 0.
            should_sleep: Whether to sleep ``2 * attempt + 1`` seconds before
                a retry.

        Returns:
            The loaded images, in the order of ``imgs`` minus any that failed.
        """
        pageImages: typing.List[Image.Image] = []
        for page in imgs:
            if page in self.page_image_dict:
                self.print(
                    "WARN",
                    f"[{attempt}] loading cached [{self.page_image_dict[page]}] [{page}]",
                )
                pageImages.append(self.page_images[self.page_image_dict[page]])
            else:
                try:
                    self.print("WARN", f"[{attempt}] downloading [{page}]")
                    # NOTE(review): no timeout is passed, so a stalled server
                    # can block this call indefinitely; consider adding one.
                    resp = requests.get(page)
                    resp.raise_for_status()
                    img = Image.open(BytesIO(resp.content))
                    if img:
                        self.page_image_dict[page] = len(self.page_images)
                        self.page_images.append(img)
                        pageImages.append(img)
                except Exception as e:
                    self.print(
                        "ERROR", f"[{attempt}] Failed to load image from {page}: {e}"
                    )
                    if attempt < 2:
                        if should_sleep:
                            time.sleep(2 * attempt + 1)
                        return self.load_images(
                            imgs, attempt + 1, should_sleep=should_sleep
                        )

        return pageImages

    def print(self, level: str, msg: str) -> None:
        """Emit ``msg`` through the logger at ``level``, or to stdout without one.

        ``level`` is case-insensitive; anything other than ERROR, INFO, WARN
        or WARNING is logged at debug level.
        """
        if not self.logger:
            print(msg)
            return

        lvl = level.upper()
        if lvl == "ERROR":
            self.logger.error_msg(msg, self.file_name, self.document_id, self.task_id)
        elif lvl == "INFO":
            self.logger.info_msg(msg, self.file_name, self.document_id, self.task_id)
        elif lvl in ("WARN", "WARNING"):
            self.logger.warning_msg(msg, self.file_name, self.document_id, self.task_id)
        else:
            self.logger.debug_msg(msg, self.file_name, self.document_id, self.task_id)

    def write_debug(self, file_name: str, data: typing.Any) -> None:
        """Write ``data`` to ``<debug_path>/<file stem>/<start>_<file_name>``.

        Does nothing when no debug path is set. Non-string data is serialised
        with ``json.dumps``; an exception instance that cannot be serialised
        is written as ``str(data)`` instead.

        Args:
            file_name: Name of the debug file, prefixed with the start timestamp.
            data: String or JSON-serialisable payload (or an exception).

        Raises:
            Exception: Whatever ``json.dumps`` raised, when ``data`` is neither
                serialisable nor an exception instance.
        """
        if not self.debug_path:
            return

        os.makedirs(self.debug_path, exist_ok=True)
        file_path = f"{self.debug_path}/{self.file_name.replace('.pdf','')}"
        os.makedirs(file_path, exist_ok=True)

        if not isinstance(data, str):
            try:
                data = json.dumps(data)
            except Exception as e:
                if isinstance(data, Exception):
                    data = str(data)
                else:
                    self.print("ERROR", f"write_debug exception: {e}")
                    # Bare raise keeps the original traceback intact.
                    raise

        with open(f"{file_path}/{self.start}_{file_name}", "w", encoding="utf-8") as f:
            f.write(data)
|