groundx 2.0.15__py3-none-any.whl → 2.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. groundx/__init__.py +73 -21
  2. groundx/buckets/__init__.py +2 -0
  3. groundx/buckets/client.py +55 -388
  4. groundx/buckets/raw_client.py +628 -0
  5. groundx/client.py +22 -21
  6. groundx/core/__init__.py +5 -0
  7. groundx/core/api_error.py +13 -5
  8. groundx/core/client_wrapper.py +4 -3
  9. groundx/core/force_multipart.py +16 -0
  10. groundx/core/http_client.py +76 -32
  11. groundx/core/http_response.py +55 -0
  12. groundx/core/jsonable_encoder.py +0 -1
  13. groundx/core/pydantic_utilities.py +71 -112
  14. groundx/core/serialization.py +7 -3
  15. groundx/csv_splitter.py +64 -0
  16. groundx/customer/__init__.py +2 -0
  17. groundx/customer/client.py +31 -43
  18. groundx/customer/raw_client.py +91 -0
  19. groundx/documents/__init__.py +1 -2
  20. groundx/documents/client.py +455 -953
  21. groundx/documents/raw_client.py +1450 -0
  22. groundx/errors/__init__.py +2 -0
  23. groundx/errors/bad_request_error.py +4 -3
  24. groundx/errors/unauthorized_error.py +4 -3
  25. groundx/extract/__init__.py +48 -0
  26. groundx/extract/agents/__init__.py +7 -0
  27. groundx/extract/agents/agent.py +202 -0
  28. groundx/extract/classes/__init__.py +24 -0
  29. groundx/extract/classes/agent.py +23 -0
  30. groundx/extract/classes/api.py +15 -0
  31. groundx/extract/classes/document.py +338 -0
  32. groundx/extract/classes/field.py +88 -0
  33. groundx/extract/classes/groundx.py +147 -0
  34. groundx/extract/classes/prompt.py +36 -0
  35. groundx/extract/classes/test_document.py +109 -0
  36. groundx/extract/classes/test_field.py +43 -0
  37. groundx/extract/classes/test_groundx.py +223 -0
  38. groundx/extract/classes/test_prompt.py +68 -0
  39. groundx/extract/post_process/__init__.py +7 -0
  40. groundx/extract/post_process/post_process.py +33 -0
  41. groundx/extract/services/.DS_Store +0 -0
  42. groundx/extract/services/__init__.py +14 -0
  43. groundx/extract/services/csv.py +76 -0
  44. groundx/extract/services/logger.py +126 -0
  45. groundx/extract/services/logging_cfg.py +53 -0
  46. groundx/extract/services/ratelimit.py +104 -0
  47. groundx/extract/services/sheets_client.py +160 -0
  48. groundx/extract/services/status.py +197 -0
  49. groundx/extract/services/upload.py +68 -0
  50. groundx/extract/services/upload_minio.py +122 -0
  51. groundx/extract/services/upload_s3.py +91 -0
  52. groundx/extract/services/utility.py +52 -0
  53. groundx/extract/settings/__init__.py +15 -0
  54. groundx/extract/settings/settings.py +212 -0
  55. groundx/extract/settings/test_settings.py +512 -0
  56. groundx/extract/tasks/__init__.py +6 -0
  57. groundx/extract/tasks/utility.py +27 -0
  58. groundx/extract/utility/__init__.py +15 -0
  59. groundx/extract/utility/classes.py +193 -0
  60. groundx/extract/utility/test_utility.py +81 -0
  61. groundx/groups/__init__.py +2 -0
  62. groundx/groups/client.py +63 -550
  63. groundx/groups/raw_client.py +901 -0
  64. groundx/health/__init__.py +2 -0
  65. groundx/health/client.py +35 -101
  66. groundx/health/raw_client.py +193 -0
  67. groundx/ingest.py +771 -0
  68. groundx/search/__init__.py +2 -0
  69. groundx/search/client.py +94 -227
  70. groundx/search/raw_client.py +442 -0
  71. groundx/search/types/__init__.py +2 -0
  72. groundx/types/__init__.py +68 -16
  73. groundx/types/bounding_box_detail.py +4 -4
  74. groundx/types/bucket_detail.py +5 -5
  75. groundx/types/bucket_list_response.py +17 -3
  76. groundx/types/bucket_response.py +3 -3
  77. groundx/types/bucket_update_detail.py +4 -4
  78. groundx/types/bucket_update_response.py +3 -3
  79. groundx/types/customer_detail.py +2 -2
  80. groundx/types/customer_response.py +3 -3
  81. groundx/types/document.py +54 -0
  82. groundx/types/document_detail.py +16 -4
  83. groundx/types/document_list_response.py +4 -4
  84. groundx/types/document_local_ingest_request.py +7 -0
  85. groundx/types/document_lookup_response.py +8 -3
  86. groundx/types/document_response.py +3 -3
  87. groundx/types/document_type.py +21 -1
  88. groundx/types/group_detail.py +4 -4
  89. groundx/types/group_list_response.py +17 -3
  90. groundx/types/group_response.py +3 -3
  91. groundx/types/health_response.py +3 -3
  92. groundx/types/health_response_health.py +3 -3
  93. groundx/types/health_service.py +5 -5
  94. groundx/types/ingest_local_document.py +25 -0
  95. groundx/types/ingest_local_document_metadata.py +51 -0
  96. groundx/types/ingest_remote_document.py +15 -6
  97. groundx/types/ingest_response.py +4 -4
  98. groundx/types/{process_status_response_ingest.py → ingest_status.py} +8 -7
  99. groundx/types/{ingest_response_ingest.py → ingest_status_light.py} +7 -5
  100. groundx/types/ingest_status_progress.py +26 -0
  101. groundx/types/{process_status_response_ingest_progress_errors.py → ingest_status_progress_cancelled.py} +4 -4
  102. groundx/types/{process_status_response_ingest_progress_complete.py → ingest_status_progress_complete.py} +4 -4
  103. groundx/types/{process_status_response_ingest_progress_cancelled.py → ingest_status_progress_errors.py} +4 -4
  104. groundx/types/{process_status_response_ingest_progress_processing.py → ingest_status_progress_processing.py} +4 -4
  105. groundx/types/message_response.py +2 -2
  106. groundx/types/meter_detail.py +2 -2
  107. groundx/types/process_level.py +5 -0
  108. groundx/types/{process_status_response.py → processes_status_response.py} +8 -5
  109. groundx/types/processing_status.py +3 -1
  110. groundx/types/search_response.py +3 -3
  111. groundx/types/search_response_search.py +3 -3
  112. groundx/types/search_result_item.py +7 -5
  113. groundx/types/search_result_item_pages_item.py +41 -0
  114. groundx/types/subscription_detail.py +3 -3
  115. groundx/types/subscription_detail_meters.py +5 -5
  116. groundx/{documents/types/website_crawl_request_websites_item.py → types/website_source.py} +7 -7
  117. groundx/types/workflow_apply_request.py +24 -0
  118. groundx/types/workflow_detail.py +59 -0
  119. groundx/types/workflow_detail_chunk_strategy.py +5 -0
  120. groundx/types/workflow_detail_relationships.py +36 -0
  121. groundx/types/workflow_engine.py +58 -0
  122. groundx/types/workflow_engine_reasoning_effort.py +5 -0
  123. groundx/types/workflow_engine_service.py +7 -0
  124. groundx/types/workflow_prompt.py +37 -0
  125. groundx/types/workflow_prompt_group.py +25 -0
  126. groundx/types/workflow_prompt_role.py +5 -0
  127. groundx/types/workflow_request.py +31 -0
  128. groundx/types/workflow_request_chunk_strategy.py +5 -0
  129. groundx/types/workflow_response.py +20 -0
  130. groundx/types/workflow_step.py +33 -0
  131. groundx/types/workflow_step_config.py +33 -0
  132. groundx/types/workflow_step_config_field.py +8 -0
  133. groundx/types/workflow_steps.py +38 -0
  134. groundx/types/workflows_response.py +20 -0
  135. groundx/workflows/__init__.py +7 -0
  136. groundx/workflows/client.py +736 -0
  137. groundx/workflows/raw_client.py +841 -0
  138. groundx/workflows/types/__init__.py +7 -0
  139. groundx/workflows/types/workflows_get_request_id.py +5 -0
  140. {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/LICENSE +1 -1
  141. {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/METADATA +39 -22
  142. groundx-2.7.7.dist-info/RECORD +155 -0
  143. groundx/documents/types/__init__.py +0 -6
  144. groundx/documents/types/documents_ingest_local_request_files_item.py +0 -43
  145. groundx/types/process_status_response_ingest_progress.py +0 -26
  146. groundx-2.0.15.dist-info/RECORD +0 -82
  147. {groundx-2.0.15.dist-info → groundx-2.7.7.dist-info}/WHEEL +0 -0
@@ -0,0 +1,88 @@
1
+ import dateparser, typing
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
class ExtractedField(BaseModel):
    """One extracted key/value pair plus any conflicting candidate values.

    Integer values are stored as floats. String values whose ``key`` contains
    "date" are rewritten as ``YYYY-MM-DD`` when ``dateparser`` can parse them.
    """

    confidence: typing.Optional[str] = None
    conflicts: typing.List[typing.Any] = []
    key: str

    value: typing.Union[str, float, typing.List[typing.Any]] = ""

    def __init__(
        self,
        value: typing.Union[str, float, typing.List[typing.Any]],
        **data: typing.Any,
    ):
        """Build the model from ``data``, then store ``value`` via ``set_value``."""
        # The other fields are initialised first so that ``self.key`` exists
        # when ``set_value`` checks whether this is a date field.
        super().__init__(**data)

        self.set_value(value)

    def contains(self, other: "ExtractedField") -> bool:
        """Return True if ``other``'s value equals this value or is a listed conflict.

        Raises:
            Exception: if this field's value (or ``other``'s value) is not a
                str, float or int.
        """
        mine = self.get_value()
        theirs = other.get_value()
        if not isinstance(mine, (str, float, int)):
            raise Exception(f"unexpected self field value type [{type(mine)}]")

        return self.equal_to_value(theirs) or theirs in self.conflicts

    def equal_to_field(self, other: "ExtractedField") -> bool:
        """Return True if ``other`` holds the same value as this field.

        Raises:
            Exception: if either value is not a str, float or int.
        """
        mine = self.get_value()
        theirs = other.get_value()
        if not isinstance(mine, (str, float, int)):
            raise Exception(f"unexpected self field value type [{type(mine)}]")

        return self.equal_to_value(theirs)

    def equal_to_value(self, other: typing.Any) -> bool:
        """Compare ``other`` with the stored value.

        Ints are compared as floats and strings are compared lower-cased; the
        two operands must end up with the same type to be considered equal.

        Raises:
            Exception: if ``other`` is not a str, float or int.
        """
        if not isinstance(other, (str, float, int)):
            raise Exception(f"unexpected value type [{type(other)}]")

        def _normalise(raw: typing.Any) -> typing.Any:
            if isinstance(raw, int):
                return float(raw)
            if isinstance(raw, str):
                return raw.lower()
            return raw

        current = _normalise(self.get_value())
        candidate = _normalise(other)

        return type(candidate) == type(current) and candidate == current

    def get_value(self) -> typing.Union[str, float, typing.List[typing.Any]]:
        """Return the stored value."""
        return self.value

    def remove_conflict(self, value: typing.Any) -> None:
        """Drop ``value`` from ``conflicts``; if it differs from the current
        value, record the current value as a conflict instead.
        """
        # NOTE(review): the nesting of the second check was reconstructed from
        # a flattened diff view (it is treated as nested under the membership
        # test) — confirm against the packaged file.
        if value not in self.conflicts:
            return

        self.conflicts.remove(value)
        if not self.equal_to_value(value):
            self.conflicts.append(self.get_value())

    def set_value(
        self, value: typing.Union[str, float, typing.List[typing.Any]]
    ) -> None:
        """Store ``value``, converting ints to float and normalising dates."""
        if isinstance(value, int):
            self.value = float(value)
            return

        is_date_field = isinstance(value, str) and "date" in self.key.lower()
        if not is_date_field:
            self.value = value
            return

        try:
            parsed = dateparser.parse(value)
            # An unparseable date string is kept verbatim.
            self.value = value if parsed is None else parsed.strftime("%Y-%m-%d")
        except Exception as e:
            print(f"date error [{value}]: [{e}]")
            self.value = value
86
+
87
+
88
+ ExtractedField.model_rebuild()
@@ -0,0 +1,147 @@
1
+ import json, requests, typing
2
+ from pathlib import Path
3
+
4
+ from pydantic import BaseModel, ConfigDict, Field
5
+
6
+ from ..services.upload import Upload
7
+
8
+
9
+ class GroundXDocument(BaseModel):
10
+ model_config = ConfigDict(populate_by_name=True)
11
+ base_url: str
12
+ document_id: str = Field(alias="documentID")
13
+ task_id: str = Field(alias="taskID")
14
+
15
+ def xray_path(self) -> str:
16
+ return f"layout/processed/{self.task_id}/{self.document_id}-xray.json"
17
+
18
+ def xray_url(self, base: typing.Optional[str] = None) -> str:
19
+ if not base:
20
+ base = self.base_url
21
+ if base.endswith("/"):
22
+ base = base[:-1]
23
+ return f"{base}/layout/processed/{self.task_id}/{self.document_id}-xray.json"
24
+
25
+ def xray(
26
+ self,
27
+ cache_dir: Path,
28
+ upload: typing.Optional[Upload] = None,
29
+ clear_cache: bool = False,
30
+ is_test: bool = False,
31
+ base: typing.Optional[str] = None,
32
+ ) -> "XRayDocument":
33
+ return XRayDocument.download(
34
+ self,
35
+ cache_dir=cache_dir,
36
+ upload=upload,
37
+ clear_cache=clear_cache,
38
+ is_test=is_test,
39
+ base=base,
40
+ )
41
+
42
+
43
+ class GroundXResponse(BaseModel):
44
+ code: int
45
+ document_id: str = Field(alias="documentID")
46
+ model_id: int = Field(alias="modelID")
47
+ processor_id: int = Field(alias="processorID")
48
+ result_url: str = Field(alias="resultURL")
49
+ task_id: str = Field(alias="taskID")
50
+
51
+
52
+ class BoundingBox(BaseModel):
53
+ bottomRightX: float
54
+ bottomRightY: float
55
+ topLeftX: float
56
+ topLeftY: float
57
+ corrected: typing.Optional[bool]
58
+ pageNumber: typing.Optional[int]
59
+
60
+
61
+ class Chunk(BaseModel):
62
+ boundingBoxes: typing.Optional[typing.List[BoundingBox]] = []
63
+ chunk: typing.Optional[str] = None
64
+ contentType: typing.Optional[typing.List[str]] = []
65
+ json_: typing.Optional[typing.List[typing.Any]] = Field(None, alias="json")
66
+ multimodalUrl: typing.Optional[str] = None
67
+ narrative: typing.Optional[typing.List[str]] = None
68
+ pageNumbers: typing.Optional[typing.List[int]] = []
69
+ sectionSummary: typing.Optional[str] = None
70
+ suggestedText: typing.Optional[str] = None
71
+ text: typing.Optional[str] = None
72
+
73
+
74
+ class DocumentPage(BaseModel):
75
+ chunks: typing.List[Chunk]
76
+ height: float
77
+ pageNumber: int
78
+ pageUrl: str
79
+ width: float
80
+
81
+
82
+ class XRayDocument(BaseModel):
83
+ chunks: typing.List[Chunk]
84
+ documentPages: typing.List[DocumentPage] = []
85
+ sourceUrl: str
86
+ fileKeywords: typing.Optional[str] = None
87
+ fileName: typing.Optional[str] = None
88
+ fileType: typing.Optional[str] = None
89
+ fileSummary: typing.Optional[str] = None
90
+ language: typing.Optional[str] = None
91
+
92
+ @classmethod
93
+ def download(
94
+ cls,
95
+ gx_doc: GroundXDocument,
96
+ cache_dir: Path,
97
+ upload: typing.Optional[Upload] = None,
98
+ clear_cache: bool = False,
99
+ is_test: bool = False,
100
+ base: typing.Optional[str] = None,
101
+ ) -> "XRayDocument":
102
+ cache_dir.mkdir(parents=True, exist_ok=True)
103
+ cache_file = cache_dir / f"{gx_doc.document_id}-xray.json"
104
+
105
+ if not clear_cache and cache_file.exists():
106
+ try:
107
+ with cache_file.open("r", encoding="utf-8") as f:
108
+ payload = json.load(f)
109
+
110
+ return cls(**payload)
111
+ except Exception as e:
112
+ raise RuntimeError(
113
+ f"Error loading cached X-ray JSON from {cache_file}: {e}"
114
+ )
115
+
116
+ if upload:
117
+ path = gx_doc.xray_path()
118
+ ru = upload.get_object(path)
119
+ if ru:
120
+ try:
121
+ payload = json.loads(ru.decode("utf-8"))
122
+ return cls(**payload)
123
+ except Exception as e:
124
+ raise RuntimeError(
125
+ f"Error decoding X-ray JSON bytes from {path}: {e}"
126
+ )
127
+
128
+ url = gx_doc.xray_url(base=base)
129
+ try:
130
+ resp = requests.get(url)
131
+ resp.raise_for_status()
132
+ except requests.RequestException as e:
133
+ raise RuntimeError(f"Error fetching X-ray JSON from {url}: {e}")
134
+
135
+ try:
136
+ payload = resp.json()
137
+ except ValueError as e:
138
+ raise RuntimeError(f"Invalid JSON returned from {url}: {e}")
139
+
140
+ if is_test is False:
141
+ try:
142
+ with cache_file.open("w", encoding="utf-8") as f:
143
+ json.dump(payload, f)
144
+ except Exception as e:
145
+ print(f"Warning: failed to write X-ray JSON cache to {cache_file}: {e}")
146
+
147
+ return cls(**payload)
@@ -0,0 +1,36 @@
1
+ import typing
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from ..utility.classes import str_to_type_sequence
6
+
7
+
8
+ class Prompt(BaseModel):
9
+ attr_name: str
10
+ prompt: str
11
+ type: typing.Union[str, typing.List[str]]
12
+
13
+ class Config:
14
+ validate_by_name = True
15
+
16
+ def valid_value(self, value: typing.Any) -> bool:
17
+ ty = self.type
18
+
19
+ types: typing.List[typing.Type[typing.Any]] = []
20
+ if isinstance(ty, list):
21
+ for t in ty:
22
+ if t == "int" or t == "float":
23
+ types.extend([int, float])
24
+ elif t == "str":
25
+ types.append(str)
26
+
27
+ return isinstance(value, tuple(types))
28
+
29
+ exp = str_to_type_sequence(ty)
30
+ for et in exp:
31
+ if et in (int, float):
32
+ types.extend([int, float])
33
+ else:
34
+ types.append(et)
35
+ types = list(dict.fromkeys(types))
36
+ return isinstance(value, tuple(types))
@@ -0,0 +1,109 @@
1
+ import pytest, typing, unittest
2
+
3
+ pytest.importorskip("PIL")
4
+
5
+ from io import BytesIO
6
+ from pathlib import Path
7
+ from PIL import Image
8
+ from unittest.mock import patch
9
+
10
+ from .document import Document, DocumentRequest
11
+ from .test_groundx import TestXRay
12
+
13
+
14
+ def DR(**data: typing.Any) -> DocumentRequest:
15
+ return DocumentRequest.model_validate(data)
16
+
17
+
18
+ def test_doc() -> Document:
19
+ return Document.from_request(
20
+ cache_dir=Path("./cache"),
21
+ base_url="",
22
+ req=test_request(),
23
+ )
24
+
25
+
26
+ def test_request() -> DocumentRequest:
27
+ return DR(documentID="D", fileName="F", modelID=1, processorID=1, taskID="T")
28
+
29
+
30
+ class TestDocument(unittest.TestCase):
31
+ def setUp(self) -> None:
32
+ patcher = patch(
33
+ "groundx.extract.classes.document.GroundXDocument.xray", autospec=True
34
+ )
35
+ self.mock_xray = patcher.start()
36
+ self.addCleanup(patcher.stop)
37
+ self.mock_xray.return_value = TestXRay("http://test.co", [])
38
+
39
+ def test_init_name(self) -> None:
40
+ st1: Document = test_doc()
41
+ self.assertEqual(st1.file_name, "F")
42
+ st2: Document = Document.from_request(
43
+ cache_dir=Path("./cache"),
44
+ base_url="",
45
+ req=DR(
46
+ documentID="D", fileName="F.pdf", modelID=1, processorID=1, taskID="T"
47
+ ),
48
+ )
49
+ self.assertEqual(st2.file_name, "F.pdf")
50
+ st3: Document = Document.from_request(
51
+ cache_dir=Path("./cache"),
52
+ base_url="",
53
+ req=DR(documentID="D", fileName="F.", modelID=1, processorID=1, taskID="T"),
54
+ )
55
+ self.assertEqual(st3.file_name, "F.")
56
+
57
+
58
+ class TestDocumentRequest(unittest.TestCase):
59
+ def test_load_images_cached(self) -> None:
60
+ urls: typing.List[str] = [
61
+ "http://example.com/page1.png",
62
+ "http://example.com/page2.png",
63
+ ]
64
+
65
+ red_img = Image.new("RGB", (10, 10), color="red")
66
+ buf = BytesIO()
67
+ red_img.save(buf, format="PNG")
68
+
69
+ st = test_request()
70
+ st.page_images = [red_img, red_img]
71
+ st.page_image_dict = {
72
+ urls[0]: 0,
73
+ urls[1]: 1,
74
+ }
75
+ st.load_images(urls)
76
+ self.assertEqual(len(st.page_images), 2)
77
+ self.assertEqual(len(st.page_image_dict), 2)
78
+
79
+ def test_load_images_download(self) -> None:
80
+ urls = ["http://example.com/page1.png", "http://example.com/page2.png"]
81
+
82
+ red_img = Image.new("RGB", (10, 10), color="red")
83
+ buf = BytesIO()
84
+ red_img.save(buf, format="PNG")
85
+ img_bytes = buf.getvalue()
86
+
87
+ class TestResp:
88
+ content = img_bytes
89
+
90
+ def raise_for_status(self) -> None:
91
+ pass
92
+
93
+ with patch("requests.get", return_value=TestResp()):
94
+ st = test_request()
95
+ st.load_images(urls)
96
+
97
+ self.assertEqual(len(st.page_images), 2)
98
+ self.assertEqual(len(st.page_image_dict), 2)
99
+ for img in st.page_images:
100
+ self.assertIsInstance(img, Image.Image)
101
+ self.assertEqual(img.size, (10, 10))
102
+
103
+ def test_load_images_error(self) -> None:
104
+ urls = ["http://example.com/page1.png", "http://example.com/page2.png"]
105
+
106
+ st = test_request()
107
+ st.load_images(urls, should_sleep=False)
108
+ self.assertEqual(len(st.page_images), 0)
109
+ self.assertEqual(len(st.page_image_dict), 0)
@@ -0,0 +1,43 @@
1
+ import pytest, typing, unittest
2
+
3
+ pytest.importorskip("dateparser")
4
+
5
+ from .field import ExtractedField
6
+
7
+
8
+ def TestField(
9
+ name: str,
10
+ value: typing.Union[str, float, typing.List[typing.Any]],
11
+ conflicts: typing.List[typing.Any] = [],
12
+ ) -> ExtractedField:
13
+ return ExtractedField(
14
+ key=name.replace("_", " "),
15
+ value=value,
16
+ conflicts=conflicts,
17
+ )
18
+
19
+
20
+ class TestExtractedField(unittest.TestCase):
21
+ def test_equalToValue_string(self):
22
+ ef = TestField("test", "hello")
23
+ self.assertTrue(ef.equal_to_value("hello"))
24
+ self.assertFalse(ef.equal_to_value("world"))
25
+
26
+ def test_equalToValue_int_float_equivalence(self):
27
+ ef = TestField("test", int(10))
28
+ self.assertTrue(ef.equal_to_value(10.0))
29
+ self.assertTrue(ef.equal_to_value(10))
30
+
31
+ def test_equalToValue_mismatch(self):
32
+ ef = TestField("test", 3.14)
33
+ self.assertFalse(ef.equal_to_value(2.71))
34
+
35
+ def test_set_value_dates(self):
36
+ ef1 = TestField("test date", "3/29/25")
37
+ self.assertEqual(ef1.get_value(), "2025-03-29")
38
+ ef2 = TestField("test date", "2025-03-29")
39
+ self.assertEqual(ef2.get_value(), "2025-03-29")
40
+
41
+
42
+ if __name__ == "__main__":
43
+ unittest.main()
@@ -0,0 +1,223 @@
1
+ import requests, typing, unittest
2
+ from pathlib import Path
3
+ from unittest.mock import patch
4
+
5
+ from pydantic import ValidationError
6
+
7
+ from .groundx import (
8
+ GroundXDocument,
9
+ XRayDocument,
10
+ Chunk,
11
+ BoundingBox,
12
+ DocumentPage,
13
+ )
14
+
15
+
16
+ class TestChunk:
17
+ def __init__(self, json_str: str):
18
+ self.sectionSummary = None
19
+ self.suggestedText = json_str
20
+
21
+
22
+ class TestDocumentPage:
23
+ def __init__(self, page_url: str):
24
+ self.pageUrl = page_url
25
+
26
+
27
+ class TestXRay:
28
+ def __init__(
29
+ self,
30
+ source_url: str,
31
+ chunks: typing.Optional[typing.List[TestChunk]] = [],
32
+ document_pages: typing.Optional[typing.List[str]] = [],
33
+ ):
34
+ self.chunks = chunks
35
+ self.documentPages: typing.List[TestDocumentPage] = []
36
+ if document_pages is not None:
37
+ for p in document_pages:
38
+ self.documentPages.append(TestDocumentPage(p))
39
+ self.sourceUrl = source_url
40
+
41
+
42
+ def GD(**data: typing.Any) -> GroundXDocument:
43
+ return GroundXDocument.model_validate(data)
44
+
45
+
46
+ def test_xray(gx: GroundXDocument) -> XRayDocument:
47
+ return XRayDocument.download(
48
+ gx, cache_dir=Path("./cache"), base="https://upload.test", is_test=True
49
+ )
50
+
51
+
52
+ class TestGroundX(unittest.TestCase):
53
+ def make_dummy_response(
54
+ self,
55
+ payload: typing.Optional[typing.Dict[str, typing.Any]] = None,
56
+ status_ok: bool = True,
57
+ json_error: bool = False,
58
+ ) -> typing.Any:
59
+ class DummyResponse:
60
+ def raise_for_status(self):
61
+ if not status_ok:
62
+ raise requests.HTTPError("HTTP error!")
63
+
64
+ def json(self):
65
+ if json_error:
66
+ raise ValueError("Bad JSON!")
67
+ return payload
68
+
69
+ return DummyResponse()
70
+
71
+ def test_xray_url(self):
72
+ gx = GD(base_url="", documentID="doc123", taskID="taskABC")
73
+ expected = "https://upload.test/layout/processed/taskABC/doc123-xray.json"
74
+ self.assertEqual(gx.xray_url(base="https://upload.test"), expected)
75
+
76
+ def test_download_success(self):
77
+ payload: typing.Dict[str, typing.Any] = {
78
+ "chunks": [],
79
+ "documentPages": [],
80
+ "sourceUrl": "https://example.com/foo.pdf",
81
+ }
82
+ dummy = self.make_dummy_response(payload=payload, status_ok=True)
83
+ with patch("requests.get", return_value=dummy):
84
+ gx = GD(base_url="", documentID="D", taskID="T")
85
+ xdoc = test_xray(gx)
86
+ self.assertIsInstance(xdoc, XRayDocument)
87
+ self.assertEqual(xdoc.chunks, [])
88
+ self.assertEqual(xdoc.documentPages, [])
89
+ self.assertEqual(xdoc.sourceUrl, payload["sourceUrl"])
90
+
91
+ def test_download_request_exception(self):
92
+ with patch("requests.get", side_effect=requests.RequestException("no network")):
93
+ gx = GD(base_url="", documentID="D", taskID="T")
94
+ with self.assertRaises(RuntimeError) as cm:
95
+ test_xray(gx)
96
+ self.assertIn("Error fetching X-ray JSON", str(cm.exception))
97
+
98
+ def test_download_http_error(self):
99
+ dummy = self.make_dummy_response(payload={}, status_ok=False)
100
+ with patch("requests.get", return_value=dummy):
101
+ gx = GD(base_url="", documentID="D", taskID="T")
102
+ with self.assertRaises(RuntimeError) as cm:
103
+ test_xray(gx)
104
+ self.assertIn("HTTP error!", str(cm.exception))
105
+
106
+ def test_download_json_error(self):
107
+ dummy = self.make_dummy_response(payload=None, status_ok=True, json_error=True)
108
+ with patch("requests.get", return_value=dummy):
109
+ gx = GD(base_url="", documentID="D", taskID="T")
110
+ with self.assertRaises(RuntimeError) as cm:
111
+ test_xray(gx)
112
+ self.assertIn("Invalid JSON returned", str(cm.exception))
113
+
114
+ def test_validation_error_on_missing_required_fields(self) -> None:
115
+ payload: typing.Dict[str, typing.Any] = {
116
+ "documentPages": [],
117
+ "sourceUrl": "https://example.com/foo.pdf",
118
+ }
119
+ dummy = self.make_dummy_response(payload=payload, status_ok=True)
120
+ with patch("requests.get", return_value=dummy):
121
+ gx = GD(base_url="", documentID="D", taskID="T")
122
+ with self.assertRaises(ValidationError) as cm:
123
+ test_xray(gx)
124
+ self.assertIn("Field required", str(cm.exception))
125
+
126
+ def test_xray_method_delegates_to_download(self) -> None:
127
+ gx = GD(base_url="", documentID="X", taskID="Y")
128
+
129
+ sentinel = object()
130
+ with patch.object(XRayDocument, "download", return_value=sentinel):
131
+ result = gx.xray(
132
+ cache_dir=Path("./cache"), base="https://upload.test", is_test=True
133
+ )
134
+ self.assertIs(result, sentinel)
135
+
136
+ def test_chunk_json_alias(self) -> None:
137
+ raw: typing.Dict[str, typing.Any] = {
138
+ "boundingBoxes": [],
139
+ "chunk": "id123",
140
+ "contentType": [],
141
+ "json": [{"foo": "bar"}],
142
+ "multimodalUrl": None,
143
+ "narrative": None,
144
+ "pageNumbers": [],
145
+ "sectionSummary": None,
146
+ "suggestedText": None,
147
+ "text": None,
148
+ }
149
+ chunk = Chunk.model_validate(raw)
150
+ self.assertEqual(chunk.json_, [{"foo": "bar"}])
151
+
152
+ self.assertNotIn("json':", chunk.model_dump_json().replace('"json"', ""))
153
+
154
+ def test_roundtrip_xray_to_models(self):
155
+ payload: dict[str, typing.Any] = {
156
+ "chunks": [
157
+ {
158
+ "boundingBoxes": [
159
+ {
160
+ "bottomRightX": 10.0,
161
+ "bottomRightY": 20.0,
162
+ "topLeftX": 1.0,
163
+ "topLeftY": 2.0,
164
+ "corrected": True,
165
+ "pageNumber": 1,
166
+ }
167
+ ],
168
+ "chunk": "foo",
169
+ "contentType": ["paragraph"],
170
+ "json": [{"a": 1}],
171
+ "multimodalUrl": None,
172
+ "narrative": ["narr1"],
173
+ "pageNumbers": [1],
174
+ "sectionSummary": None,
175
+ "suggestedText": None,
176
+ "text": "hello",
177
+ }
178
+ ],
179
+ "documentPages": [
180
+ {
181
+ "chunks": [],
182
+ "height": 500,
183
+ "pageNumber": 1,
184
+ "pageUrl": "https://page.jpg",
185
+ "width": 400,
186
+ }
187
+ ],
188
+ "sourceUrl": "https://doc.pdf",
189
+ "fileKeywords": "kw",
190
+ "fileName": "file.pdf",
191
+ "fileType": "pdf",
192
+ "fileSummary": "sum",
193
+ "language": "en",
194
+ }
195
+ dummy = self.make_dummy_response(payload=payload, status_ok=True)
196
+ with patch("requests.get", return_value=dummy):
197
+ gx = GD(base_url="", documentID="D", taskID="T")
198
+ xdoc = test_xray(gx)
199
+
200
+ self.assertEqual(xdoc.fileType, "pdf")
201
+ self.assertEqual(xdoc.fileName, "file.pdf")
202
+ self.assertEqual(xdoc.fileKeywords, "kw")
203
+ self.assertEqual(xdoc.language, "en")
204
+
205
+ self.assertEqual(len(xdoc.chunks), 1)
206
+ chunk = xdoc.chunks[0]
207
+ self.assertIsInstance(chunk, Chunk)
208
+ self.assertEqual(chunk.chunk, "foo")
209
+ bb: typing.Optional[BoundingBox] = None
210
+ if chunk.boundingBoxes is not None and len(chunk.boundingBoxes) > 0:
211
+ bb = chunk.boundingBoxes[0]
212
+ self.assertIsInstance(bb, BoundingBox)
213
+ assert bb is not None, "BoundingBox should not be None"
214
+ self.assertTrue(bb.corrected)
215
+
216
+ self.assertEqual(len(xdoc.documentPages), 1)
217
+ page = xdoc.documentPages[0]
218
+ self.assertIsInstance(page, DocumentPage)
219
+ self.assertEqual(page.pageUrl, "https://page.jpg")
220
+
221
+
222
+ if __name__ == "__main__":
223
+ unittest.main()