groundx 2.0.15__py3-none-any.whl → 2.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundx/__init__.py +73 -21
- groundx/buckets/__init__.py +2 -0
- groundx/buckets/client.py +55 -388
- groundx/buckets/raw_client.py +628 -0
- groundx/client.py +22 -21
- groundx/core/__init__.py +5 -0
- groundx/core/api_error.py +13 -5
- groundx/core/client_wrapper.py +4 -3
- groundx/core/force_multipart.py +16 -0
- groundx/core/http_client.py +76 -32
- groundx/core/http_response.py +55 -0
- groundx/core/jsonable_encoder.py +0 -1
- groundx/core/pydantic_utilities.py +71 -112
- groundx/core/serialization.py +7 -3
- groundx/csv_splitter.py +64 -0
- groundx/customer/__init__.py +2 -0
- groundx/customer/client.py +31 -43
- groundx/customer/raw_client.py +91 -0
- groundx/documents/__init__.py +1 -2
- groundx/documents/client.py +455 -953
- groundx/documents/raw_client.py +1450 -0
- groundx/errors/__init__.py +2 -0
- groundx/errors/bad_request_error.py +4 -3
- groundx/errors/unauthorized_error.py +4 -3
- groundx/extract/__init__.py +48 -0
- groundx/extract/agents/__init__.py +7 -0
- groundx/extract/agents/agent.py +202 -0
- groundx/extract/classes/__init__.py +24 -0
- groundx/extract/classes/agent.py +23 -0
- groundx/extract/classes/api.py +15 -0
- groundx/extract/classes/document.py +338 -0
- groundx/extract/classes/field.py +88 -0
- groundx/extract/classes/groundx.py +147 -0
- groundx/extract/classes/prompt.py +36 -0
- groundx/extract/classes/test_document.py +109 -0
- groundx/extract/classes/test_field.py +43 -0
- groundx/extract/classes/test_groundx.py +223 -0
- groundx/extract/classes/test_prompt.py +68 -0
- groundx/extract/post_process/__init__.py +7 -0
- groundx/extract/post_process/post_process.py +33 -0
- groundx/extract/services/.DS_Store +0 -0
- groundx/extract/services/__init__.py +14 -0
- groundx/extract/services/csv.py +76 -0
- groundx/extract/services/logger.py +126 -0
- groundx/extract/services/logging_cfg.py +53 -0
- groundx/extract/services/ratelimit.py +104 -0
- groundx/extract/services/sheets_client.py +160 -0
- groundx/extract/services/status.py +197 -0
- groundx/extract/services/upload.py +68 -0
- groundx/extract/services/upload_minio.py +122 -0
- groundx/extract/services/upload_s3.py +91 -0
- groundx/extract/services/utility.py +52 -0
- groundx/extract/settings/__init__.py +15 -0
- groundx/extract/settings/settings.py +212 -0
- groundx/extract/settings/test_settings.py +512 -0
- groundx/extract/tasks/__init__.py +6 -0
- groundx/extract/tasks/utility.py +27 -0
- groundx/extract/utility/__init__.py +15 -0
- groundx/extract/utility/classes.py +193 -0
- groundx/extract/utility/test_utility.py +81 -0
- groundx/groups/__init__.py +2 -0
- groundx/groups/client.py +63 -550
- groundx/groups/raw_client.py +901 -0
- groundx/health/__init__.py +2 -0
- groundx/health/client.py +35 -101
- groundx/health/raw_client.py +193 -0
- groundx/ingest.py +771 -0
- groundx/search/__init__.py +2 -0
- groundx/search/client.py +94 -227
- groundx/search/raw_client.py +442 -0
- groundx/search/types/__init__.py +2 -0
- groundx/types/__init__.py +68 -16
- groundx/types/bounding_box_detail.py +4 -4
- groundx/types/bucket_detail.py +5 -5
- groundx/types/bucket_list_response.py +17 -3
- groundx/types/bucket_response.py +3 -3
- groundx/types/bucket_update_detail.py +4 -4
- groundx/types/bucket_update_response.py +3 -3
- groundx/types/customer_detail.py +2 -2
- groundx/types/customer_response.py +3 -3
- groundx/types/document.py +54 -0
- groundx/types/document_detail.py +16 -4
- groundx/types/document_list_response.py +4 -4
- groundx/types/document_local_ingest_request.py +7 -0
- groundx/types/document_lookup_response.py +8 -3
- groundx/types/document_response.py +3 -3
- groundx/types/document_type.py +21 -1
- groundx/types/group_detail.py +4 -4
- groundx/types/group_list_response.py +17 -3
- groundx/types/group_response.py +3 -3
- groundx/types/health_response.py +3 -3
- groundx/types/health_response_health.py +3 -3
- groundx/types/health_service.py +5 -5
- groundx/types/ingest_local_document.py +25 -0
- groundx/types/ingest_local_document_metadata.py +51 -0
- groundx/types/ingest_remote_document.py +15 -6
- groundx/types/ingest_response.py +4 -4
- groundx/types/{process_status_response_ingest.py → ingest_status.py} +8 -7
- groundx/types/{ingest_response_ingest.py → ingest_status_light.py} +7 -5
- groundx/types/ingest_status_progress.py +26 -0
- groundx/types/{process_status_response_ingest_progress_errors.py → ingest_status_progress_cancelled.py} +4 -4
- groundx/types/{process_status_response_ingest_progress_complete.py → ingest_status_progress_complete.py} +4 -4
- groundx/types/{process_status_response_ingest_progress_cancelled.py → ingest_status_progress_errors.py} +4 -4
- groundx/types/{process_status_response_ingest_progress_processing.py → ingest_status_progress_processing.py} +4 -4
- groundx/types/message_response.py +2 -2
- groundx/types/meter_detail.py +2 -2
- groundx/types/process_level.py +5 -0
- groundx/types/{process_status_response.py → processes_status_response.py} +8 -5
- groundx/types/processing_status.py +3 -1
- groundx/types/search_response.py +3 -3
- groundx/types/search_response_search.py +3 -3
- groundx/types/search_result_item.py +7 -5
- groundx/types/search_result_item_pages_item.py +41 -0
- groundx/types/subscription_detail.py +3 -3
- groundx/types/subscription_detail_meters.py +5 -5
- groundx/{documents/types/website_crawl_request_websites_item.py → types/website_source.py} +7 -7
- groundx/types/workflow_apply_request.py +24 -0
- groundx/types/workflow_detail.py +59 -0
- groundx/types/workflow_detail_chunk_strategy.py +5 -0
- groundx/types/workflow_detail_relationships.py +36 -0
- groundx/types/workflow_engine.py +58 -0
- groundx/types/workflow_engine_reasoning_effort.py +5 -0
- groundx/types/workflow_engine_service.py +7 -0
- groundx/types/workflow_prompt.py +37 -0
- groundx/types/workflow_prompt_group.py +25 -0
- groundx/types/workflow_prompt_role.py +5 -0
- groundx/types/workflow_request.py +31 -0
- groundx/types/workflow_request_chunk_strategy.py +5 -0
- groundx/types/workflow_response.py +20 -0
- groundx/types/workflow_step.py +33 -0
- groundx/types/workflow_step_config.py +33 -0
- groundx/types/workflow_step_config_field.py +8 -0
- groundx/types/workflow_steps.py +38 -0
- groundx/types/workflows_response.py +20 -0
- groundx/workflows/__init__.py +7 -0
- groundx/workflows/client.py +736 -0
- groundx/workflows/raw_client.py +841 -0
- groundx/workflows/types/__init__.py +7 -0
- groundx/workflows/types/workflows_get_request_id.py +5 -0
- {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/LICENSE +1 -1
- {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/METADATA +39 -22
- groundx-2.7.7.dist-info/RECORD +155 -0
- groundx/documents/types/__init__.py +0 -6
- groundx/documents/types/documents_ingest_local_request_files_item.py +0 -43
- groundx/types/process_status_response_ingest_progress.py +0 -26
- groundx-2.0.15.dist-info/RECORD +0 -82
- {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import dateparser, typing
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ExtractedField(BaseModel):
    """One extracted key/value pair, tracking alternative conflicting values."""

    # Free-form confidence label (string, not numeric, in this model).
    confidence: typing.Optional[str] = None
    # Alternative values seen for this key that disagree with `value`.
    # NOTE(review): pydantic copies field defaults per instance, so the
    # mutable [] default is safe here — confirm pydantic v2 is in use.
    conflicts: typing.List[typing.Any] = []
    key: str

    value: typing.Union[str, float, typing.List[typing.Any]] = ""

    def __init__(
        self,
        value: typing.Union[str, float, typing.List[typing.Any]],
        **data: typing.Any,
    ):
        # Validate the declared fields first, then normalize `value`
        # via set_value (int -> float, date-like strings -> ISO format).
        super().__init__(**data)

        self.set_value(value)

    def contains(self, other: "ExtractedField") -> bool:
        # True when `other`'s value equals this field's value or appears
        # in the recorded conflicts.
        self_val = self.get_value()
        other_val = other.get_value()
        if not (isinstance(self_val, (str, float, int))):
            raise Exception(f"unexpected self field value type [{type(self_val)}]")

        if self.equal_to_value(other_val):
            return True

        # NOTE(review): this membership test is case-sensitive for strings,
        # unlike equal_to_value above — confirm the asymmetry is intended.
        if other_val in self.conflicts:
            return True

        return False

    def equal_to_field(self, other: "ExtractedField") -> bool:
        # Scalar-only equality against another field's value.
        self_val = self.get_value()
        other_val = other.get_value()
        if not (isinstance(self_val, (str, float, int))):
            raise Exception(f"unexpected self field value type [{type(self_val)}]")

        return self.equal_to_value(other_val)

    def equal_to_value(self, other: typing.Any) -> bool:
        # Equality with normalization: ints compare as floats, strings
        # compare case-insensitively; normalized types must still match.
        if not (isinstance(other, (str, float, int))):
            raise Exception(f"unexpected value type [{type(other)}]")

        exist = self.get_value()
        if isinstance(exist, int):
            exist = float(exist)
        if isinstance(other, int):
            other = float(other)
        if isinstance(exist, str):
            exist = exist.lower()
        if isinstance(other, str):
            other = other.lower()

        return type(other) == type(exist) and other == exist

    def get_value(self) -> typing.Union[str, float, typing.List[typing.Any]]:
        return self.value

    def remove_conflict(self, value: typing.Any) -> None:
        # Drop `value` from the conflict list; if the current value differs
        # from it, record the current value as a conflict instead.
        # NOTE(review): nesting reconstructed from a whitespace-stripped
        # source — confirm the inner `if` is scoped to a successful remove.
        if value in self.conflicts:
            self.conflicts.remove(value)
            if not self.equal_to_value(value):
                self.conflicts.append(self.get_value())

    def set_value(
        self, value: typing.Union[str, float, typing.List[typing.Any]]
    ) -> None:
        # Normalization rules: ints become floats; string values whose key
        # contains "date" are parsed and reformatted as YYYY-MM-DD when
        # dateparser can read them, otherwise stored verbatim.
        if isinstance(value, int):
            self.value = float(value)
        elif isinstance(value, str) and "date" in self.key.lower():
            try:
                dt = dateparser.parse(value)
                if dt is None:
                    self.value = value
                else:
                    self.value = dt.strftime("%Y-%m-%d")
            except Exception as e:
                print(f"date error [{value}]: [{e}]")
                self.value = value
        else:
            self.value = value


ExtractedField.model_rebuild()
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import json, requests, typing
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
5
|
+
|
|
6
|
+
from ..services.upload import Upload
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class GroundXDocument(BaseModel):
    """Reference to a processed GroundX document (IDs plus storage base URL).

    Accepts both the snake_case field names and their camelCase wire
    aliases (documentID, taskID).
    """

    model_config = ConfigDict(populate_by_name=True)
    base_url: str
    document_id: str = Field(alias="documentID")
    task_id: str = Field(alias="taskID")

    def xray_path(self) -> str:
        """Relative storage path of this document's X-ray JSON artifact."""
        return f"layout/processed/{self.task_id}/{self.document_id}-xray.json"

    def xray_url(self, base: typing.Optional[str] = None) -> str:
        """Absolute URL of the X-ray JSON; `base` overrides `self.base_url`."""
        if not base:
            base = self.base_url
        if base.endswith("/"):
            base = base[:-1]
        # Reuse xray_path() so the path format is defined in exactly one
        # place (it was previously duplicated here).
        return f"{base}/{self.xray_path()}"

    def xray(
        self,
        cache_dir: Path,
        upload: typing.Optional[Upload] = None,
        clear_cache: bool = False,
        is_test: bool = False,
        base: typing.Optional[str] = None,
    ) -> "XRayDocument":
        """Fetch (or load from cache) the parsed X-ray document.

        Delegates to XRayDocument.download; see that method for the
        cache / object-store / HTTP resolution order.
        """
        return XRayDocument.download(
            self,
            cache_dir=cache_dir,
            upload=upload,
            clear_cache=clear_cache,
            is_test=is_test,
            base=base,
        )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class GroundXResponse(BaseModel):
    # Response payload returned by the GroundX processing API; the
    # camelCase aliases mirror the wire format.
    code: int
    document_id: str = Field(alias="documentID")
    model_id: int = Field(alias="modelID")
    processor_id: int = Field(alias="processorID")
    result_url: str = Field(alias="resultURL")
    task_id: str = Field(alias="taskID")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class BoundingBox(BaseModel):
    # Axis-aligned bounding box for a chunk region on a page.
    bottomRightX: float
    bottomRightY: float
    topLeftX: float
    topLeftY: float
    # NOTE(review): no defaults, so these keys must be present in the
    # payload even though their values may be null — confirm intended.
    corrected: typing.Optional[bool]
    pageNumber: typing.Optional[int]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class Chunk(BaseModel):
    # One extracted content chunk from an X-ray document.
    boundingBoxes: typing.Optional[typing.List[BoundingBox]] = []
    chunk: typing.Optional[str] = None
    contentType: typing.Optional[typing.List[str]] = []
    # Field is aliased because the attribute name "json" would clash with
    # BaseModel's own json helpers; the wire key stays "json".
    json_: typing.Optional[typing.List[typing.Any]] = Field(None, alias="json")
    multimodalUrl: typing.Optional[str] = None
    narrative: typing.Optional[typing.List[str]] = None
    pageNumbers: typing.Optional[typing.List[int]] = []
    sectionSummary: typing.Optional[str] = None
    suggestedText: typing.Optional[str] = None
    text: typing.Optional[str] = None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class DocumentPage(BaseModel):
    # One rendered page of the source document with its chunks and
    # pixel dimensions.
    chunks: typing.List[Chunk]
    height: float
    pageNumber: int
    pageUrl: str
    width: float
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class XRayDocument(BaseModel):
    """Parsed X-ray output for a document: chunks, pages, file metadata."""

    chunks: typing.List[Chunk]
    documentPages: typing.List[DocumentPage] = []
    sourceUrl: str
    fileKeywords: typing.Optional[str] = None
    fileName: typing.Optional[str] = None
    fileType: typing.Optional[str] = None
    fileSummary: typing.Optional[str] = None
    language: typing.Optional[str] = None

    @classmethod
    def download(
        cls,
        gx_doc: GroundXDocument,
        cache_dir: Path,
        upload: typing.Optional[Upload] = None,
        clear_cache: bool = False,
        is_test: bool = False,
        base: typing.Optional[str] = None,
    ) -> "XRayDocument":
        """Resolve the X-ray JSON for `gx_doc` and parse it into a model.

        Resolution order:
          1. local cache file in `cache_dir` (unless `clear_cache`),
          2. the `upload` object store, if provided and it has the object,
          3. HTTP GET of gx_doc.xray_url(base).

        A successful HTTP fetch is written back to the cache unless
        `is_test` is True. Raises RuntimeError on fetch/decode failures;
        pydantic validation errors from the HTTP path propagate as-is.
        """
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_file = cache_dir / f"{gx_doc.document_id}-xray.json"

        if not clear_cache and cache_file.exists():
            try:
                with cache_file.open("r", encoding="utf-8") as f:
                    payload = json.load(f)

                return cls(**payload)
            except Exception as e:
                # Chain the cause so the original traceback is preserved.
                raise RuntimeError(
                    f"Error loading cached X-ray JSON from {cache_file}: {e}"
                ) from e

        if upload:
            path = gx_doc.xray_path()
            ru = upload.get_object(path)
            if ru:
                try:
                    payload = json.loads(ru.decode("utf-8"))
                    return cls(**payload)
                except Exception as e:
                    raise RuntimeError(
                        f"Error decoding X-ray JSON bytes from {path}: {e}"
                    ) from e

        url = gx_doc.xray_url(base=base)
        try:
            # A timeout guards against hanging forever on an unresponsive
            # host; the previous call had none.
            resp = requests.get(url, timeout=60)
            resp.raise_for_status()
        except requests.RequestException as e:
            raise RuntimeError(f"Error fetching X-ray JSON from {url}: {e}") from e

        try:
            payload = resp.json()
        except ValueError as e:
            raise RuntimeError(f"Invalid JSON returned from {url}: {e}") from e

        # Best-effort cache write: failures are reported but not fatal.
        if is_test is False:
            try:
                with cache_file.open("w", encoding="utf-8") as f:
                    json.dump(payload, f)
            except Exception as e:
                print(f"Warning: failed to write X-ray JSON cache to {cache_file}: {e}")

        return cls(**payload)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
from ..utility.classes import str_to_type_sequence
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Prompt(BaseModel):
    # A field-extraction prompt: the attribute it fills, the prompt text,
    # and the expected value type(s) expressed as strings (e.g. "int").
    attr_name: str
    prompt: str
    type: typing.Union[str, typing.List[str]]

    class Config:
        validate_by_name = True

    def valid_value(self, value: typing.Any) -> bool:
        # True when `value` is an instance of one of the declared types.
        # int and float are treated as interchangeable numeric types.
        ty = self.type

        types: typing.List[typing.Type[typing.Any]] = []
        if isinstance(ty, list):
            # List spec: only "int"/"float"/"str" entries are recognized;
            # anything else is silently ignored.
            for t in ty:
                if t == "int" or t == "float":
                    types.extend([int, float])
                elif t == "str":
                    types.append(str)

            return isinstance(value, tuple(types))

        # Single string spec: expand via the shared parser.
        # NOTE(review): assumes str_to_type_sequence returns concrete
        # Python types for the spec string — confirm in utility.classes.
        exp = str_to_type_sequence(ty)
        for et in exp:
            if et in (int, float):
                types.extend([int, float])
            else:
                types.append(et)
        # De-duplicate while preserving order.
        types = list(dict.fromkeys(types))
        return isinstance(value, tuple(types))
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import pytest, typing, unittest
|
|
2
|
+
|
|
3
|
+
pytest.importorskip("PIL")
|
|
4
|
+
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from PIL import Image
|
|
8
|
+
from unittest.mock import patch
|
|
9
|
+
|
|
10
|
+
from .document import Document, DocumentRequest
|
|
11
|
+
from .test_groundx import TestXRay
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def DR(**data: typing.Any) -> DocumentRequest:
    # Shorthand fixture: validate keyword data into a DocumentRequest.
    return DocumentRequest.model_validate(data)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_doc() -> Document:
    # Build a Document from the canned request, using a local cache dir.
    return Document.from_request(
        cache_dir=Path("./cache"),
        base_url="",
        req=test_request(),
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_request() -> DocumentRequest:
    # Minimal valid request fixture shared by the tests below.
    return DR(documentID="D", fileName="F", modelID=1, processorID=1, taskID="T")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class TestDocument(unittest.TestCase):
    # Document construction tests. GroundXDocument.xray is patched so no
    # network or cache I/O happens during setup.
    def setUp(self) -> None:
        patcher = patch(
            "groundx.extract.classes.document.GroundXDocument.xray", autospec=True
        )
        self.mock_xray = patcher.start()
        self.addCleanup(patcher.stop)
        self.mock_xray.return_value = TestXRay("http://test.co", [])

    def test_init_name(self) -> None:
        # file_name is preserved verbatim, with or without an extension.
        st1: Document = test_doc()
        self.assertEqual(st1.file_name, "F")
        st2: Document = Document.from_request(
            cache_dir=Path("./cache"),
            base_url="",
            req=DR(
                documentID="D", fileName="F.pdf", modelID=1, processorID=1, taskID="T"
            ),
        )
        self.assertEqual(st2.file_name, "F.pdf")
        st3: Document = Document.from_request(
            cache_dir=Path("./cache"),
            base_url="",
            req=DR(documentID="D", fileName="F.", modelID=1, processorID=1, taskID="T"),
        )
        self.assertEqual(st3.file_name, "F.")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class TestDocumentRequest(unittest.TestCase):
    """Tests for DocumentRequest.load_images caching/downloading behavior."""

    def test_load_images_cached(self) -> None:
        # Images already present in the request cache are not re-downloaded.
        urls: typing.List[str] = [
            "http://example.com/page1.png",
            "http://example.com/page2.png",
        ]

        # (The previous version also encoded red_img to an unused PNG
        # buffer here; that dead code has been removed.)
        red_img = Image.new("RGB", (10, 10), color="red")

        st = test_request()
        st.page_images = [red_img, red_img]
        st.page_image_dict = {
            urls[0]: 0,
            urls[1]: 1,
        }
        st.load_images(urls)
        self.assertEqual(len(st.page_images), 2)
        self.assertEqual(len(st.page_image_dict), 2)

    def test_load_images_download(self) -> None:
        # Images are fetched over HTTP (mocked) and decoded with PIL.
        urls = ["http://example.com/page1.png", "http://example.com/page2.png"]

        red_img = Image.new("RGB", (10, 10), color="red")
        buf = BytesIO()
        red_img.save(buf, format="PNG")
        img_bytes = buf.getvalue()

        class TestResp:
            # Minimal stand-in for requests.Response.
            content = img_bytes

            def raise_for_status(self) -> None:
                pass

        with patch("requests.get", return_value=TestResp()):
            st = test_request()
            st.load_images(urls)

            self.assertEqual(len(st.page_images), 2)
            self.assertEqual(len(st.page_image_dict), 2)
            for img in st.page_images:
                self.assertIsInstance(img, Image.Image)
                self.assertEqual(img.size, (10, 10))

    def test_load_images_error(self) -> None:
        # Unreachable URLs leave the request empty rather than raising.
        urls = ["http://example.com/page1.png", "http://example.com/page2.png"]

        st = test_request()
        st.load_images(urls, should_sleep=False)
        self.assertEqual(len(st.page_images), 0)
        self.assertEqual(len(st.page_image_dict), 0)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import pytest, typing, unittest
|
|
2
|
+
|
|
3
|
+
pytest.importorskip("dateparser")
|
|
4
|
+
|
|
5
|
+
from .field import ExtractedField
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def TestField(
    name: str,
    value: typing.Union[str, float, typing.List[typing.Any]],
    conflicts: typing.Optional[typing.List[typing.Any]] = None,
) -> ExtractedField:
    """Fixture builder: underscores in `name` become spaces in the key.

    `conflicts` defaults to a fresh list per call — the previous `= []`
    default was a shared mutable default argument.
    """
    return ExtractedField(
        key=name.replace("_", " "),
        value=value,
        conflicts=[] if conflicts is None else conflicts,
    )
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class TestExtractedField(unittest.TestCase):
    def test_equalToValue_string(self):
        # String equality is case-normalized inside equal_to_value.
        ef = TestField("test", "hello")
        self.assertTrue(ef.equal_to_value("hello"))
        self.assertFalse(ef.equal_to_value("world"))

    def test_equalToValue_int_float_equivalence(self):
        # ints are coerced to float on both sides before comparing.
        ef = TestField("test", int(10))
        self.assertTrue(ef.equal_to_value(10.0))
        self.assertTrue(ef.equal_to_value(10))

    def test_equalToValue_mismatch(self):
        ef = TestField("test", 3.14)
        self.assertFalse(ef.equal_to_value(2.71))

    def test_set_value_dates(self):
        # Keys containing "date" trigger dateparser normalization to
        # YYYY-MM-DD; already-ISO values pass through unchanged.
        ef1 = TestField("test date", "3/29/25")
        self.assertEqual(ef1.get_value(), "2025-03-29")
        ef2 = TestField("test date", "2025-03-29")
        self.assertEqual(ef2.get_value(), "2025-03-29")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
if __name__ == "__main__":
|
|
43
|
+
unittest.main()
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
import requests, typing, unittest
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from unittest.mock import patch
|
|
4
|
+
|
|
5
|
+
from pydantic import ValidationError
|
|
6
|
+
|
|
7
|
+
from .groundx import (
|
|
8
|
+
GroundXDocument,
|
|
9
|
+
XRayDocument,
|
|
10
|
+
Chunk,
|
|
11
|
+
BoundingBox,
|
|
12
|
+
DocumentPage,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TestChunk:
    # Minimal stand-in for groundx.Chunk used by fixtures in this suite.
    def __init__(self, json_str: str):
        self.sectionSummary = None
        self.suggestedText = json_str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class TestDocumentPage:
    # Minimal stand-in for groundx.DocumentPage (URL only).
    def __init__(self, page_url: str):
        self.pageUrl = page_url
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class TestXRay:
    """Minimal stand-in for XRayDocument used as a fixture.

    Defaults are None sentinels instead of shared mutable lists (the
    previous `= []` defaults were shared across every instance built
    with defaults — the classic mutable-default-argument bug).
    """

    def __init__(
        self,
        source_url: str,
        chunks: typing.Optional[typing.List["TestChunk"]] = None,
        document_pages: typing.Optional[typing.List[str]] = None,
    ):
        # Each instance gets its own containers.
        self.chunks = [] if chunks is None else chunks
        self.documentPages: typing.List["TestDocumentPage"] = []
        if document_pages is not None:
            for p in document_pages:
                self.documentPages.append(TestDocumentPage(p))
        self.sourceUrl = source_url
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def GD(**data: typing.Any) -> GroundXDocument:
    # Shorthand fixture: validate keyword data into a GroundXDocument.
    return GroundXDocument.model_validate(data)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_xray(gx: GroundXDocument) -> XRayDocument:
    # Download helper with is_test=True so nothing is written to cache.
    return XRayDocument.download(
        gx, cache_dir=Path("./cache"), base="https://upload.test", is_test=True
    )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class TestGroundX(unittest.TestCase):
    # End-to-end tests of XRayDocument.download with requests.get patched.

    def make_dummy_response(
        self,
        payload: typing.Optional[typing.Dict[str, typing.Any]] = None,
        status_ok: bool = True,
        json_error: bool = False,
    ) -> typing.Any:
        # Build a stand-in for requests.Response with controllable
        # HTTP-status and JSON-decoding failures.
        class DummyResponse:
            def raise_for_status(self):
                if not status_ok:
                    raise requests.HTTPError("HTTP error!")

            def json(self):
                if json_error:
                    raise ValueError("Bad JSON!")
                return payload

        return DummyResponse()

    def test_xray_url(self):
        gx = GD(base_url="", documentID="doc123", taskID="taskABC")
        expected = "https://upload.test/layout/processed/taskABC/doc123-xray.json"
        self.assertEqual(gx.xray_url(base="https://upload.test"), expected)

    def test_download_success(self):
        payload: typing.Dict[str, typing.Any] = {
            "chunks": [],
            "documentPages": [],
            "sourceUrl": "https://example.com/foo.pdf",
        }
        dummy = self.make_dummy_response(payload=payload, status_ok=True)
        with patch("requests.get", return_value=dummy):
            gx = GD(base_url="", documentID="D", taskID="T")
            xdoc = test_xray(gx)
            self.assertIsInstance(xdoc, XRayDocument)
            self.assertEqual(xdoc.chunks, [])
            self.assertEqual(xdoc.documentPages, [])
            self.assertEqual(xdoc.sourceUrl, payload["sourceUrl"])

    def test_download_request_exception(self):
        # Network errors surface as RuntimeError with a descriptive message.
        with patch("requests.get", side_effect=requests.RequestException("no network")):
            gx = GD(base_url="", documentID="D", taskID="T")
            with self.assertRaises(RuntimeError) as cm:
                test_xray(gx)
            self.assertIn("Error fetching X-ray JSON", str(cm.exception))

    def test_download_http_error(self):
        dummy = self.make_dummy_response(payload={}, status_ok=False)
        with patch("requests.get", return_value=dummy):
            gx = GD(base_url="", documentID="D", taskID="T")
            with self.assertRaises(RuntimeError) as cm:
                test_xray(gx)
            self.assertIn("HTTP error!", str(cm.exception))

    def test_download_json_error(self):
        dummy = self.make_dummy_response(payload=None, status_ok=True, json_error=True)
        with patch("requests.get", return_value=dummy):
            gx = GD(base_url="", documentID="D", taskID="T")
            with self.assertRaises(RuntimeError) as cm:
                test_xray(gx)
            self.assertIn("Invalid JSON returned", str(cm.exception))

    def test_validation_error_on_missing_required_fields(self) -> None:
        # "chunks" is required on XRayDocument, so validation must fail.
        payload: typing.Dict[str, typing.Any] = {
            "documentPages": [],
            "sourceUrl": "https://example.com/foo.pdf",
        }
        dummy = self.make_dummy_response(payload=payload, status_ok=True)
        with patch("requests.get", return_value=dummy):
            gx = GD(base_url="", documentID="D", taskID="T")
            with self.assertRaises(ValidationError) as cm:
                test_xray(gx)
            self.assertIn("Field required", str(cm.exception))

    def test_xray_method_delegates_to_download(self) -> None:
        gx = GD(base_url="", documentID="X", taskID="Y")

        sentinel = object()
        with patch.object(XRayDocument, "download", return_value=sentinel):
            result = gx.xray(
                cache_dir=Path("./cache"), base="https://upload.test", is_test=True
            )
        self.assertIs(result, sentinel)

    def test_chunk_json_alias(self) -> None:
        raw: typing.Dict[str, typing.Any] = {
            "boundingBoxes": [],
            "chunk": "id123",
            "contentType": [],
            "json": [{"foo": "bar"}],
            "multimodalUrl": None,
            "narrative": None,
            "pageNumbers": [],
            "sectionSummary": None,
            "suggestedText": None,
            "text": None,
        }
        chunk = Chunk.model_validate(raw)
        self.assertEqual(chunk.json_, [{"foo": "bar"}])

        # NOTE(review): this assertion looks near-vacuous — once '"json"'
        # is stripped, "json':" cannot appear; confirm original intent.
        self.assertNotIn("json':", chunk.model_dump_json().replace('"json"', ""))

    def test_roundtrip_xray_to_models(self):
        # Full payload -> model round-trip, checking nested model types.
        payload: dict[str, typing.Any] = {
            "chunks": [
                {
                    "boundingBoxes": [
                        {
                            "bottomRightX": 10.0,
                            "bottomRightY": 20.0,
                            "topLeftX": 1.0,
                            "topLeftY": 2.0,
                            "corrected": True,
                            "pageNumber": 1,
                        }
                    ],
                    "chunk": "foo",
                    "contentType": ["paragraph"],
                    "json": [{"a": 1}],
                    "multimodalUrl": None,
                    "narrative": ["narr1"],
                    "pageNumbers": [1],
                    "sectionSummary": None,
                    "suggestedText": None,
                    "text": "hello",
                }
            ],
            "documentPages": [
                {
                    "chunks": [],
                    "height": 500,
                    "pageNumber": 1,
                    "pageUrl": "https://page.jpg",
                    "width": 400,
                }
            ],
            "sourceUrl": "https://doc.pdf",
            "fileKeywords": "kw",
            "fileName": "file.pdf",
            "fileType": "pdf",
            "fileSummary": "sum",
            "language": "en",
        }
        dummy = self.make_dummy_response(payload=payload, status_ok=True)
        with patch("requests.get", return_value=dummy):
            gx = GD(base_url="", documentID="D", taskID="T")
            xdoc = test_xray(gx)

            self.assertEqual(xdoc.fileType, "pdf")
            self.assertEqual(xdoc.fileName, "file.pdf")
            self.assertEqual(xdoc.fileKeywords, "kw")
            self.assertEqual(xdoc.language, "en")

            self.assertEqual(len(xdoc.chunks), 1)
            chunk = xdoc.chunks[0]
            self.assertIsInstance(chunk, Chunk)
            self.assertEqual(chunk.chunk, "foo")
            bb: typing.Optional[BoundingBox] = None
            if chunk.boundingBoxes is not None and len(chunk.boundingBoxes) > 0:
                bb = chunk.boundingBoxes[0]
            self.assertIsInstance(bb, BoundingBox)
            assert bb is not None, "BoundingBox should not be None"
            self.assertTrue(bb.corrected)

            self.assertEqual(len(xdoc.documentPages), 1)
            page = xdoc.documentPages[0]
            self.assertIsInstance(page, DocumentPage)
            self.assertEqual(page.pageUrl, "https://page.jpg")
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
if __name__ == "__main__":
|
|
223
|
+
unittest.main()
|