groundx 2.4.4__py3-none-any.whl → 2.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of groundx might be problematic. Click here for more details.

Files changed (36) hide show
  1. groundx/core/client_wrapper.py +2 -2
  2. groundx/extract/__init__.py +38 -0
  3. groundx/extract/agents/__init__.py +7 -0
  4. groundx/extract/agents/agent.py +202 -0
  5. groundx/extract/classes/__init__.py +27 -0
  6. groundx/extract/classes/agent.py +22 -0
  7. groundx/extract/classes/api.py +15 -0
  8. groundx/extract/classes/document.py +311 -0
  9. groundx/extract/classes/field.py +88 -0
  10. groundx/extract/classes/groundx.py +123 -0
  11. groundx/extract/classes/post_process.py +33 -0
  12. groundx/extract/classes/prompt.py +36 -0
  13. groundx/extract/classes/settings.py +169 -0
  14. groundx/extract/classes/test_document.py +126 -0
  15. groundx/extract/classes/test_field.py +43 -0
  16. groundx/extract/classes/test_groundx.py +188 -0
  17. groundx/extract/classes/test_prompt.py +68 -0
  18. groundx/extract/classes/test_settings.py +515 -0
  19. groundx/extract/classes/test_utility.py +81 -0
  20. groundx/extract/classes/utility.py +193 -0
  21. groundx/extract/services/.DS_Store +0 -0
  22. groundx/extract/services/__init__.py +14 -0
  23. groundx/extract/services/csv.py +76 -0
  24. groundx/extract/services/logger.py +127 -0
  25. groundx/extract/services/logging_cfg.py +55 -0
  26. groundx/extract/services/ratelimit.py +104 -0
  27. groundx/extract/services/sheets_client.py +160 -0
  28. groundx/extract/services/status.py +197 -0
  29. groundx/extract/services/upload.py +73 -0
  30. groundx/extract/services/upload_minio.py +122 -0
  31. groundx/extract/services/upload_s3.py +84 -0
  32. groundx/extract/services/utility.py +52 -0
  33. {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/METADATA +1 -1
  34. {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/RECORD +36 -5
  35. {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/LICENSE +0 -0
  36. {groundx-2.4.4.dist-info → groundx-2.4.9.dist-info}/WHEEL +0 -0
@@ -0,0 +1,188 @@
1
+ import requests, typing, unittest
2
+ from unittest.mock import patch
3
+
4
+ from pydantic import ValidationError
5
+
6
+ from .groundx import (
7
+ GroundXDocument,
8
+ XRayDocument,
9
+ Chunk,
10
+ BoundingBox,
11
+ DocumentPage,
12
+ )
13
+
14
+
15
def GD(**data: typing.Any) -> GroundXDocument:
    """Build a ``GroundXDocument`` from keyword arguments.

    The keyword arguments are collected into a dict and handed to
    ``GroundXDocument.model_validate``; anything that call raises propagates
    to the caller unchanged.
    """
    fields: typing.Dict[str, typing.Any] = data
    return GroundXDocument.model_validate(fields)
17
+
18
+
19
class TestGroundX(unittest.TestCase):
    """Tests for ``GroundXDocument``, ``XRayDocument`` and their nested models.

    No test performs real HTTP: every test that calls
    ``XRayDocument.download`` patches ``requests.get`` first.
    """

    def make_dummy_response(
        self,
        payload: typing.Optional[typing.Dict[str, typing.Any]] = None,
        status_ok: bool = True,
        json_error: bool = False,
    ) -> typing.Any:
        """Return a minimal stand-in for a ``requests`` response object.

        Args:
            payload: Value returned by the stand-in's ``json()``.
            status_ok: When false, ``raise_for_status()`` raises
                ``requests.HTTPError("HTTP error!")``.
            json_error: When true, ``json()`` raises ``ValueError("Bad JSON!")``
                instead of returning ``payload``.

        Returns:
            An object exposing only ``raise_for_status()`` and ``json()``.
        """

        class DummyResponse:
            def raise_for_status(self) -> None:
                if not status_ok:
                    raise requests.HTTPError("HTTP error!")

            def json(self) -> typing.Any:
                if json_error:
                    raise ValueError("Bad JSON!")
                return payload

        return DummyResponse()

    def test_xray_url(self) -> None:
        # The X-ray URL is expected to combine the base, task ID and document ID.
        gx = GD(base_url="", documentID="doc123", taskID="taskABC")
        expected = "https://upload.test/layout/processed/taskABC/doc123-xray.json"
        self.assertEqual(gx.xray_url(base="https://upload.test"), expected)

    def test_download_success(self) -> None:
        # A well-formed payload must be parsed into an XRayDocument.
        payload: typing.Dict[str, typing.Any] = {
            "chunks": [],
            "documentPages": [],
            "sourceUrl": "https://example.com/foo.pdf",
        }
        dummy = self.make_dummy_response(payload=payload, status_ok=True)
        with patch("requests.get", return_value=dummy):
            gx = GD(base_url="", documentID="D", taskID="T")
            xdoc = XRayDocument.download(gx, base="https://upload.test", is_test=True)
            self.assertIsInstance(xdoc, XRayDocument)
            self.assertEqual(xdoc.chunks, [])
            self.assertEqual(xdoc.documentPages, [])
            self.assertEqual(xdoc.sourceUrl, payload["sourceUrl"])

    def test_download_request_exception(self) -> None:
        # A transport-level failure is expected to surface as RuntimeError.
        with patch("requests.get", side_effect=requests.RequestException("no network")):
            gx = GD(base_url="", documentID="D", taskID="T")
            with self.assertRaises(RuntimeError) as cm:
                XRayDocument.download(gx, base="https://upload.test", is_test=True)
            self.assertIn("Error fetching X-ray JSON", str(cm.exception))

    def test_download_http_error(self) -> None:
        # An HTTP error raised by raise_for_status() is expected to surface as
        # RuntimeError carrying the original message.
        dummy = self.make_dummy_response(payload={}, status_ok=False)
        with patch("requests.get", return_value=dummy):
            gx = GD(base_url="", documentID="D", taskID="T")
            with self.assertRaises(RuntimeError) as cm:
                XRayDocument.download(gx, base="https://upload.test", is_test=True)
            self.assertIn("HTTP error!", str(cm.exception))

    def test_download_json_error(self) -> None:
        # A body that cannot be decoded as JSON is expected to surface as RuntimeError.
        dummy = self.make_dummy_response(payload=None, status_ok=True, json_error=True)
        with patch("requests.get", return_value=dummy):
            gx = GD(base_url="", documentID="D", taskID="T")
            with self.assertRaises(RuntimeError) as cm:
                XRayDocument.download(gx, base="https://upload.test", is_test=True)
            self.assertIn("Invalid JSON returned", str(cm.exception))

    def test_validation_error_on_missing_required_fields(self) -> None:
        # The payload deliberately omits "chunks"; the test expects a pydantic
        # ValidationError rather than a RuntimeError.
        payload: typing.Dict[str, typing.Any] = {
            "documentPages": [],
            "sourceUrl": "https://example.com/foo.pdf",
        }
        dummy = self.make_dummy_response(payload=payload, status_ok=True)
        with patch("requests.get", return_value=dummy):
            gx = GD(base_url="", documentID="D", taskID="T")
            with self.assertRaises(ValidationError) as cm:
                XRayDocument.download(gx, base="https://upload.test", is_test=True)
            self.assertIn("Field required", str(cm.exception))

    def test_xray_method_delegates_to_download(self) -> None:
        # GroundXDocument.xray() must return whatever XRayDocument.download returns.
        gx = GD(base_url="", documentID="X", taskID="Y")

        sentinel = object()
        with patch.object(XRayDocument, "download", return_value=sentinel):
            result = gx.xray(base="https://upload.test", is_test=True)
            self.assertIs(result, sentinel)

    def test_chunk_json_alias(self) -> None:
        # The incoming "json" key must populate the ``json_`` attribute.
        raw: typing.Dict[str, typing.Any] = {
            "boundingBoxes": [],
            "chunk": "id123",
            "contentType": [],
            "json": [{"foo": "bar"}],
            "multimodalUrl": None,
            "narrative": None,
            "pageNumbers": [],
            "sectionSummary": None,
            "suggestedText": None,
            "text": None,
        }
        chunk = Chunk.model_validate(raw)
        self.assertEqual(chunk.json_, [{"foo": "bar"}])

        # NOTE(review): this assertion looks vacuous. JSON output is
        # double-quoted and none of the values above contain a single quote,
        # so the fragment "json':" cannot occur in the dump. Kept unchanged
        # because the intended serialization check is not visible from this
        # file - confirm against the Chunk model and tighten.
        self.assertNotIn("json':", chunk.model_dump_json().replace('"json"', ""))

    def test_roundtrip_xray_to_models(self) -> None:
        # A fully populated payload must be parsed into nested model instances.
        payload: typing.Dict[str, typing.Any] = {
            "chunks": [
                {
                    "boundingBoxes": [
                        {
                            "bottomRightX": 10.0,
                            "bottomRightY": 20.0,
                            "topLeftX": 1.0,
                            "topLeftY": 2.0,
                            "corrected": True,
                            "pageNumber": 1,
                        }
                    ],
                    "chunk": "foo",
                    "contentType": ["paragraph"],
                    "json": [{"a": 1}],
                    "multimodalUrl": None,
                    "narrative": ["narr1"],
                    "pageNumbers": [1],
                    "sectionSummary": None,
                    "suggestedText": None,
                    "text": "hello",
                }
            ],
            "documentPages": [
                {
                    "chunks": [],
                    "height": 500,
                    "pageNumber": 1,
                    "pageUrl": "https://page.jpg",
                    "width": 400,
                }
            ],
            "sourceUrl": "https://doc.pdf",
            "fileKeywords": "kw",
            "fileName": "file.pdf",
            "fileType": "pdf",
            "fileSummary": "sum",
            "language": "en",
        }
        dummy = self.make_dummy_response(payload=payload, status_ok=True)
        with patch("requests.get", return_value=dummy):
            gx = GD(base_url="", documentID="D", taskID="T")
            xdoc = XRayDocument.download(gx, base="https://upload.test", is_test=True)

        self.assertEqual(xdoc.fileType, "pdf")
        self.assertEqual(xdoc.fileName, "file.pdf")
        self.assertEqual(xdoc.fileKeywords, "kw")
        self.assertEqual(xdoc.language, "en")

        self.assertEqual(len(xdoc.chunks), 1)
        chunk = xdoc.chunks[0]
        self.assertIsInstance(chunk, Chunk)
        self.assertEqual(chunk.chunk, "foo")
        bb: typing.Optional[BoundingBox] = None
        if chunk.boundingBoxes:
            bb = chunk.boundingBoxes[0]
        self.assertIsInstance(bb, BoundingBox)
        # The bare assert only narrows Optional[BoundingBox] for type checkers;
        # assertIsInstance above has already failed the test if bb is None.
        assert bb is not None, "BoundingBox should not be None"
        self.assertTrue(bb.corrected)

        self.assertEqual(len(xdoc.documentPages), 1)
        page = xdoc.documentPages[0]
        self.assertIsInstance(page, DocumentPage)
        self.assertEqual(page.pageUrl, "https://page.jpg")
185
+
186
+
187
# Run this module's tests when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
@@ -0,0 +1,68 @@
1
+ import typing, unittest
2
+
3
+ from .prompt import Prompt
4
+
5
+
6
def TestPrompt(
    name: str,
    ty: typing.Union[str, typing.List[str]],
) -> Prompt:
    """Create a ``Prompt`` fixture for the tests in this module.

    Args:
        name: Used verbatim as ``attr_name``; the prompt text is ``name`` with
            every underscore replaced by a hyphen.
        ty: Passed through unchanged as the prompt's ``type``.

    Returns:
        The constructed ``Prompt``.
    """
    # NOTE: despite the ``Test`` prefix this is a factory function, not a test case.
    prompt_text = name.replace("_", "-")
    return Prompt(attr_name=name, prompt=prompt_text, type=ty)
15
+
16
+
17
class TestPromptValidValue(unittest.TestCase):
    """Checks ``Prompt.valid_value`` for each declared type, plus ``repr``."""

    def test_single_type_str(self):
        # A "str" prompt is expected to accept strings only.
        is_valid = TestPrompt("field1", "str").valid_value
        self.assertTrue(is_valid("hello"))
        self.assertFalse(is_valid(123))
        self.assertFalse(is_valid((1, 2, 3)))
        self.assertFalse(is_valid([1, 2, 3]))

    def test_single_type_int(self):
        # An "int" prompt is expected to accept ints and floats alike.
        is_valid = TestPrompt("field1", "int").valid_value
        self.assertFalse(is_valid("hello"))
        self.assertTrue(is_valid(123))
        self.assertTrue(is_valid(12.3))
        self.assertFalse(is_valid((1, 2, 3)))
        self.assertFalse(is_valid([1, 2, 3]))

    def test_single_type_float(self):
        # A "float" prompt is expected to accept ints and floats alike.
        is_valid = TestPrompt("field1", "float").valid_value
        self.assertFalse(is_valid("hello"))
        self.assertTrue(is_valid(123))
        self.assertTrue(is_valid(12.3))
        self.assertTrue(is_valid(123.0))
        self.assertFalse(is_valid((1, 2, 3)))
        self.assertFalse(is_valid([1, 2, 3]))

    def test_single_type_list(self):
        # A "list" prompt is expected to accept lists only; tuples are rejected.
        is_valid = TestPrompt("field1", "list").valid_value
        self.assertFalse(is_valid("hello"))
        self.assertFalse(is_valid(123))
        self.assertFalse(is_valid(12.3))
        self.assertFalse(is_valid(123.0))
        self.assertFalse(is_valid((1, 2, 3)))
        self.assertTrue(is_valid([1, 2, 3]))

    def test_list_of_types_success_and_failure(self):
        # With several declared types, a value matching any one of them is valid.
        is_valid = TestPrompt("field2", ["str", "float"]).valid_value
        self.assertTrue(is_valid("hello"))
        self.assertTrue(is_valid(123))
        self.assertTrue(is_valid(12.3))
        self.assertTrue(is_valid(123.0))
        self.assertFalse(is_valid((1, 2, 3)))
        self.assertFalse(is_valid([1, 2, 3]))

    def test_repr_contains_fields(self):
        # repr() must mention both the attribute name and the derived prompt text.
        rendered = repr(TestPrompt("field_5", "int"))
        for fragment in ("field_5", "field-5"):
            self.assertIn(fragment, rendered)
65
+
66
+
67
# Run this module's tests when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()