groundx 2.6.0__py3-none-any.whl → 2.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of groundx might be problematic. Click here for more details.

Files changed (38)
  1. groundx/__init__.py +14 -18
  2. groundx/client.py +3 -3
  3. groundx/core/client_wrapper.py +2 -2
  4. groundx/extract/classes/document.py +33 -16
  5. groundx/extract/classes/groundx.py +37 -18
  6. groundx/extract/services/logging_cfg.py +0 -2
  7. groundx/extract/services/upload.py +1 -6
  8. groundx/extract/services/upload_s3.py +10 -3
  9. groundx/extract/settings/settings.py +51 -9
  10. groundx/extract/settings/test_settings.py +0 -3
  11. groundx/ingest.py +100 -37
  12. groundx/types/__init__.py +10 -14
  13. groundx/types/workflow_detail.py +4 -0
  14. groundx/types/workflow_detail_chunk_strategy.py +5 -0
  15. groundx/types/workflow_prompt.py +1 -3
  16. groundx/types/workflow_prompt_role.py +1 -1
  17. groundx/types/{workflow_steps_doc_summary.py → workflow_request.py} +12 -4
  18. groundx/types/workflow_request_chunk_strategy.py +5 -0
  19. groundx/types/workflow_step.py +11 -4
  20. groundx/types/workflow_step_config.py +33 -0
  21. groundx/types/workflow_step_config_field.py +8 -0
  22. groundx/types/workflow_steps.py +12 -24
  23. groundx/{workflow → workflows}/__init__.py +2 -2
  24. groundx/{workflow → workflows}/client.py +67 -74
  25. groundx/{workflow → workflows}/raw_client.py +30 -23
  26. groundx/workflows/types/__init__.py +7 -0
  27. groundx/{workflow/types/workflow_get_request_id.py → workflows/types/workflows_get_request_id.py} +1 -1
  28. {groundx-2.6.0.dist-info → groundx-2.7.8.dist-info}/METADATA +1 -1
  29. {groundx-2.6.0.dist-info → groundx-2.7.8.dist-info}/RECORD +31 -33
  30. groundx/types/workflow_steps_chunk_instruct.py +0 -24
  31. groundx/types/workflow_steps_chunk_summary.py +0 -26
  32. groundx/types/workflow_steps_doc_keys.py +0 -22
  33. groundx/types/workflow_steps_search_query.py +0 -22
  34. groundx/types/workflow_steps_sect_instruct.py +0 -20
  35. groundx/types/workflow_steps_sect_summary.py +0 -23
  36. groundx/workflow/types/__init__.py +0 -7
  37. {groundx-2.6.0.dist-info → groundx-2.7.8.dist-info}/LICENSE +0 -0
  38. {groundx-2.6.0.dist-info → groundx-2.7.8.dist-info}/WHEEL +0 -0
groundx/__init__.py CHANGED
@@ -52,6 +52,7 @@ from .types import (
52
52
  WebsiteSource,
53
53
  WorkflowApplyRequest,
54
54
  WorkflowDetail,
55
+ WorkflowDetailChunkStrategy,
55
56
  WorkflowDetailRelationships,
56
57
  WorkflowEngine,
57
58
  WorkflowEngineReasoningEffort,
@@ -59,25 +60,22 @@ from .types import (
59
60
  WorkflowPrompt,
60
61
  WorkflowPromptGroup,
61
62
  WorkflowPromptRole,
63
+ WorkflowRequest,
64
+ WorkflowRequestChunkStrategy,
62
65
  WorkflowResponse,
63
66
  WorkflowStep,
67
+ WorkflowStepConfig,
68
+ WorkflowStepConfigField,
64
69
  WorkflowSteps,
65
- WorkflowStepsChunkInstruct,
66
- WorkflowStepsChunkSummary,
67
- WorkflowStepsDocKeys,
68
- WorkflowStepsDocSummary,
69
- WorkflowStepsSearchQuery,
70
- WorkflowStepsSectInstruct,
71
- WorkflowStepsSectSummary,
72
70
  WorkflowsResponse,
73
71
  )
74
72
  from .errors import BadRequestError, UnauthorizedError
75
- from . import buckets, customer, documents, groups, health, search, workflow
73
+ from . import buckets, customer, documents, groups, health, search, workflows
76
74
  from .environment import GroundXEnvironment
77
75
  from .ingest import AsyncGroundX, GroundX
78
76
  from .search import SearchContentRequestId
79
77
  from .version import __version__
80
- from .workflow import WorkflowGetRequestId
78
+ from .workflows import WorkflowsGetRequestId
81
79
 
82
80
  __all__ = [
83
81
  "AsyncGroundX",
@@ -135,24 +133,22 @@ __all__ = [
135
133
  "WebsiteSource",
136
134
  "WorkflowApplyRequest",
137
135
  "WorkflowDetail",
136
+ "WorkflowDetailChunkStrategy",
138
137
  "WorkflowDetailRelationships",
139
138
  "WorkflowEngine",
140
139
  "WorkflowEngineReasoningEffort",
141
140
  "WorkflowEngineService",
142
- "WorkflowGetRequestId",
143
141
  "WorkflowPrompt",
144
142
  "WorkflowPromptGroup",
145
143
  "WorkflowPromptRole",
144
+ "WorkflowRequest",
145
+ "WorkflowRequestChunkStrategy",
146
146
  "WorkflowResponse",
147
147
  "WorkflowStep",
148
+ "WorkflowStepConfig",
149
+ "WorkflowStepConfigField",
148
150
  "WorkflowSteps",
149
- "WorkflowStepsChunkInstruct",
150
- "WorkflowStepsChunkSummary",
151
- "WorkflowStepsDocKeys",
152
- "WorkflowStepsDocSummary",
153
- "WorkflowStepsSearchQuery",
154
- "WorkflowStepsSectInstruct",
155
- "WorkflowStepsSectSummary",
151
+ "WorkflowsGetRequestId",
156
152
  "WorkflowsResponse",
157
153
  "__version__",
158
154
  "buckets",
@@ -161,5 +157,5 @@ __all__ = [
161
157
  "groups",
162
158
  "health",
163
159
  "search",
164
- "workflow",
160
+ "workflows",
165
161
  ]
groundx/client.py CHANGED
@@ -11,7 +11,7 @@ from .environment import GroundXEnvironment
11
11
  from .groups.client import AsyncGroupsClient, GroupsClient
12
12
  from .health.client import AsyncHealthClient, HealthClient
13
13
  from .search.client import AsyncSearchClient, SearchClient
14
- from .workflow.client import AsyncWorkflowClient, WorkflowClient
14
+ from .workflows.client import AsyncWorkflowsClient, WorkflowsClient
15
15
 
16
16
 
17
17
  class GroundXBase:
@@ -78,7 +78,7 @@ class GroundXBase:
78
78
  self.search = SearchClient(client_wrapper=self._client_wrapper)
79
79
  self.buckets = BucketsClient(client_wrapper=self._client_wrapper)
80
80
  self.groups = GroupsClient(client_wrapper=self._client_wrapper)
81
- self.workflow = WorkflowClient(client_wrapper=self._client_wrapper)
81
+ self.workflows = WorkflowsClient(client_wrapper=self._client_wrapper)
82
82
  self.customer = CustomerClient(client_wrapper=self._client_wrapper)
83
83
  self.health = HealthClient(client_wrapper=self._client_wrapper)
84
84
 
@@ -147,7 +147,7 @@ class AsyncGroundXBase:
147
147
  self.search = AsyncSearchClient(client_wrapper=self._client_wrapper)
148
148
  self.buckets = AsyncBucketsClient(client_wrapper=self._client_wrapper)
149
149
  self.groups = AsyncGroupsClient(client_wrapper=self._client_wrapper)
150
- self.workflow = AsyncWorkflowClient(client_wrapper=self._client_wrapper)
150
+ self.workflows = AsyncWorkflowsClient(client_wrapper=self._client_wrapper)
151
151
  self.customer = AsyncCustomerClient(client_wrapper=self._client_wrapper)
152
152
  self.health = AsyncHealthClient(client_wrapper=self._client_wrapper)
153
153
 
groundx/core/client_wrapper.py CHANGED
@@ -14,10 +14,10 @@ class BaseClientWrapper:
14
14
 
15
15
  def get_headers(self) -> typing.Dict[str, str]:
16
16
  headers: typing.Dict[str, str] = {
17
- "User-Agent": "groundx/2.6.0",
17
+ "User-Agent": "groundx/2.7.8",
18
18
  "X-Fern-Language": "Python",
19
19
  "X-Fern-SDK-Name": "groundx",
20
- "X-Fern-SDK-Version": "2.6.0",
20
+ "X-Fern-SDK-Version": "2.7.8",
21
21
  }
22
22
  headers["X-API-Key"] = self.api_key
23
23
  return headers
groundx/extract/classes/document.py CHANGED
@@ -4,9 +4,11 @@ from io import BytesIO
4
4
  from pathlib import Path
5
5
  from PIL import Image
6
6
  from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
7
+ from urllib.parse import urlparse
7
8
 
8
9
  from .groundx import GroundXDocument
9
10
  from ..services.logger import Logger
11
+ from ..services.upload import Upload
10
12
  from ..utility.classes import clean_json
11
13
 
12
14
 
@@ -44,6 +46,7 @@ class Document(BaseModel):
44
46
  base_url: str,
45
47
  cache_dir: Path,
46
48
  req: "DocumentRequest",
49
+ upload: typing.Optional[Upload] = None,
47
50
  **data: typing.Any,
48
51
  ) -> DocT:
49
52
  st = cls(**data)
@@ -56,7 +59,7 @@ class Document(BaseModel):
56
59
  base_url=base_url,
57
60
  documentID=req.document_id,
58
61
  taskID=req.task_id,
59
- ).xray(cache_dir=cache_dir, clear_cache=req.clear_cache)
62
+ ).xray(upload=upload, cache_dir=cache_dir, clear_cache=req.clear_cache)
60
63
 
61
64
  for page in xray_doc.documentPages:
62
65
  st.page_images.append(page.pageUrl)
@@ -250,6 +253,7 @@ class DocumentRequest(BaseModel):
250
253
  def load_images(
251
254
  self,
252
255
  imgs: typing.List[str],
256
+ upload: typing.Optional[Upload] = None,
253
257
  attempt: int = 0,
254
258
  should_sleep: bool = True,
255
259
  ) -> typing.List[Image.Image]:
@@ -261,26 +265,39 @@ class DocumentRequest(BaseModel):
261
265
  f"[{attempt}] loading cached [{self.page_image_dict[page]}] [{page}]",
262
266
  )
263
267
  pageImages.append(self.page_images[self.page_image_dict[page]])
264
- else:
265
- try:
266
- self.print("WARN", f"[{attempt}] downloading [{page}]")
267
- resp = requests.get(page)
268
- resp.raise_for_status()
269
- img = Image.open(BytesIO(resp.content))
268
+ continue
269
+
270
+ if upload:
271
+ parsed = urlparse(page)
272
+ path = parsed.path + ("?" + parsed.query if parsed.query else "")
273
+ ru = upload.get_object(path)
274
+ if ru:
275
+ img = Image.open(BytesIO(ru))
270
276
  if img:
271
277
  self.page_image_dict[page] = len(self.page_images)
272
278
  self.page_images.append(img)
273
279
  pageImages.append(img)
274
- except Exception as e:
275
- self.print(
276
- "ERROR", f"[{attempt}] Failed to load image from {page}: {e}"
280
+ continue
281
+
282
+ try:
283
+ self.print("WARN", f"[{attempt}] downloading [{page}]")
284
+ resp = requests.get(page)
285
+ resp.raise_for_status()
286
+ img = Image.open(BytesIO(resp.content))
287
+ if img:
288
+ self.page_image_dict[page] = len(self.page_images)
289
+ self.page_images.append(img)
290
+ pageImages.append(img)
291
+ except Exception as e:
292
+ self.print(
293
+ "ERROR", f"[{attempt}] Failed to load image from {page}: {e}"
294
+ )
295
+ if attempt < 2:
296
+ if should_sleep:
297
+ time.sleep(2 * attempt + 1)
298
+ return self.load_images(
299
+ imgs, upload, attempt + 1, should_sleep=should_sleep
277
300
  )
278
- if attempt < 2:
279
- if should_sleep:
280
- time.sleep(2 * attempt + 1)
281
- return self.load_images(
282
- imgs, attempt + 1, should_sleep=should_sleep
283
- )
284
301
 
285
302
  return pageImages
286
303
 
groundx/extract/classes/groundx.py CHANGED
@@ -3,6 +3,8 @@ from pathlib import Path
3
3
 
4
4
  from pydantic import BaseModel, ConfigDict, Field
5
5
 
6
+ from ..services.upload import Upload
7
+
6
8
 
7
9
  class GroundXDocument(BaseModel):
8
10
  model_config = ConfigDict(populate_by_name=True)
@@ -10,6 +12,9 @@ class GroundXDocument(BaseModel):
10
12
  document_id: str = Field(alias="documentID")
11
13
  task_id: str = Field(alias="taskID")
12
14
 
15
+ def xray_path(self) -> str:
16
+ return f"layout/processed/{self.task_id}/{self.document_id}-xray.json"
17
+
13
18
  def xray_url(self, base: typing.Optional[str] = None) -> str:
14
19
  if not base:
15
20
  base = self.base_url
@@ -20,6 +25,7 @@ class GroundXDocument(BaseModel):
20
25
  def xray(
21
26
  self,
22
27
  cache_dir: Path,
28
+ upload: typing.Optional[Upload] = None,
23
29
  clear_cache: bool = False,
24
30
  is_test: bool = False,
25
31
  base: typing.Optional[str] = None,
@@ -27,9 +33,10 @@ class GroundXDocument(BaseModel):
27
33
  return XRayDocument.download(
28
34
  self,
29
35
  cache_dir=cache_dir,
30
- base=base,
36
+ upload=upload,
31
37
  clear_cache=clear_cache,
32
38
  is_test=is_test,
39
+ base=base,
33
40
  )
34
41
 
35
42
 
@@ -87,6 +94,7 @@ class XRayDocument(BaseModel):
87
94
  cls,
88
95
  gx_doc: GroundXDocument,
89
96
  cache_dir: Path,
97
+ upload: typing.Optional[Upload] = None,
90
98
  clear_cache: bool = False,
91
99
  is_test: bool = False,
92
100
  base: typing.Optional[str] = None,
@@ -99,30 +107,41 @@ class XRayDocument(BaseModel):
99
107
  with cache_file.open("r", encoding="utf-8") as f:
100
108
  payload = json.load(f)
101
109
 
110
+ return cls(**payload)
102
111
  except Exception as e:
103
112
  raise RuntimeError(
104
113
  f"Error loading cached X-ray JSON from {cache_file}: {e}"
105
114
  )
106
- else:
107
- url = gx_doc.xray_url(base=base)
108
- try:
109
- resp = requests.get(url)
110
- resp.raise_for_status()
111
- except requests.RequestException as e:
112
- raise RuntimeError(f"Error fetching X-ray JSON from {url}: {e}")
113
-
114
- try:
115
- payload = resp.json()
116
- except ValueError as e:
117
- raise RuntimeError(f"Invalid JSON returned from {url}: {e}")
118
115
 
119
- if is_test is False:
116
+ if upload:
117
+ path = gx_doc.xray_path()
118
+ ru = upload.get_object(path)
119
+ if ru:
120
120
  try:
121
- with cache_file.open("w", encoding="utf-8") as f:
122
- json.dump(payload, f)
121
+ payload = json.loads(ru.decode("utf-8"))
122
+ return cls(**payload)
123
123
  except Exception as e:
124
- print(
125
- f"Warning: failed to write X-ray JSON cache to {cache_file}: {e}"
124
+ raise RuntimeError(
125
+ f"Error decoding X-ray JSON bytes from {path}: {e}"
126
126
  )
127
127
 
128
+ url = gx_doc.xray_url(base=base)
129
+ try:
130
+ resp = requests.get(url)
131
+ resp.raise_for_status()
132
+ except requests.RequestException as e:
133
+ raise RuntimeError(f"Error fetching X-ray JSON from {url}: {e}")
134
+
135
+ try:
136
+ payload = resp.json()
137
+ except ValueError as e:
138
+ raise RuntimeError(f"Invalid JSON returned from {url}: {e}")
139
+
140
+ if is_test is False:
141
+ try:
142
+ with cache_file.open("w", encoding="utf-8") as f:
143
+ json.dump(payload, f)
144
+ except Exception as e:
145
+ print(f"Warning: failed to write X-ray JSON cache to {cache_file}: {e}")
146
+
128
147
  return cls(**payload)
groundx/extract/services/logging_cfg.py CHANGED
@@ -2,8 +2,6 @@ import typing
2
2
 
3
3
 
4
4
  def logging_config(name: str, level: str) -> typing.Dict[str, typing.Any]:
5
- print(level)
6
-
7
5
  return {
8
6
  "version": 1,
9
7
  "disable_existing_loggers": False,
groundx/extract/services/upload.py CHANGED
@@ -46,13 +46,8 @@ class Upload:
46
46
  else:
47
47
  raise Exception(f"unsupported upload.type [{self.settings.upload.type}]")
48
48
 
49
- def get_file(self, url: str) -> bytes:
50
- return bytes()
51
-
52
49
  def get_object(self, url: str) -> typing.Optional[bytes]:
53
- self.client.get_object(url)
54
-
55
- return None
50
+ return self.client.get_object(url)
56
51
 
57
52
  def put_object(
58
53
  self,
groundx/extract/services/upload_s3.py CHANGED
@@ -25,12 +25,19 @@ class S3Client:
25
25
 
26
26
  def get_object(self, url: str) -> typing.Optional[bytes]:
27
27
  if not self.client:
28
+ print("get_object no client")
28
29
  return None
29
30
 
30
31
  try:
31
- s3_uri_parts = url.replace("s3://", "").split("/")
32
- s3_bucket = s3_uri_parts[0]
33
- s3_key = "/".join(s3_uri_parts[1:])
32
+ if url.startswith("s3://"):
33
+ s3_uri_parts = url.replace("s3://", "").split("/")
34
+ s3_bucket = s3_uri_parts[0]
35
+ s3_key = "/".join(s3_uri_parts[1:])
36
+ else:
37
+ s3_bucket = self.settings.upload.bucket
38
+ s3_key = url
39
+ if url.startswith("/"):
40
+ s3_key = url[1:]
34
41
 
35
42
  response = self.client.get_object(Bucket=s3_bucket, Key=s3_key)
36
43
 
groundx/extract/settings/settings.py CHANGED
@@ -17,6 +17,8 @@ GX_DEFAULT_REGION: str = "GROUNDX_DEFAULT_REGION"
17
17
  GX_SECRET: str = "GROUNDX_SECRET_ACCESS_KEY"
18
18
  GX_TOKEN: str = "GROUNDX_SESSION_TOKEN"
19
19
  VALID_KEYS: str = "GROUNDX_VALID_API_KEYS"
20
+ GX_ADMIN_API_KEY: str = "GROUNDX_ADMIN_API_KEY"
21
+ GX_ADMIN_USERNAME: str = "GROUNDX_ADMIN_USERNAME"
20
22
 
21
23
 
22
24
  class AgentSettings(BaseModel):
@@ -77,22 +79,54 @@ class ContainerSettings(BaseModel):
77
79
  if key:
78
80
  return key
79
81
 
82
+ key = os.environ.get(GX_ADMIN_USERNAME)
83
+ if key:
84
+ return key
85
+
86
+ key = os.environ.get(GX_ADMIN_API_KEY)
87
+ if key:
88
+ return key
89
+
90
+ key = os.environ.get(GX_API_KEY)
91
+ if key:
92
+ return key
93
+
80
94
  raise Exception(f"you must set a callback_api_key")
81
95
 
82
96
  def get_valid_api_keys(self) -> typing.List[str]:
97
+ keys: typing.List[str] = []
98
+
83
99
  if self.valid_api_keys:
84
- return self.valid_api_keys
100
+ keys = self.valid_api_keys
85
101
 
86
- keys: typing.Optional[str] = os.environ.get(VALID_KEYS)
87
- if not keys:
88
- raise Exception(f"you must set an array of valid_api_keys")
102
+ env_keys: typing.Optional[str] = os.environ.get(VALID_KEYS)
103
+ if env_keys:
104
+ try:
105
+ data: typing.List[str] = json.loads(env_keys)
106
+ keys.extend(data)
107
+ except Exception as e:
108
+ raise Exception(f"you must set an array of valid_api_keys: {e}")
89
109
 
90
- try:
91
- data: typing.List[str] = json.loads(keys)
92
- except Exception as e:
93
- raise Exception(f"you must set an array of valid_api_keys: {e}")
110
+ key = os.environ.get(CALLBACK_KEY)
111
+ if key:
112
+ keys.append(key)
94
113
 
95
- return data
114
+ key = os.environ.get(GX_ADMIN_API_KEY)
115
+ if key:
116
+ keys.append(key)
117
+
118
+ key = os.environ.get(GX_ADMIN_USERNAME)
119
+ if key:
120
+ keys.append(key)
121
+
122
+ key = os.environ.get(GX_API_KEY)
123
+ if key:
124
+ keys.append(key)
125
+
126
+ if len(keys) < 1:
127
+ raise Exception(f"you must set an array of valid_api_keys")
128
+
129
+ return keys
96
130
 
97
131
  def loglevel(self) -> str:
98
132
  return self.log_level.upper()
@@ -163,8 +197,16 @@ class GroundXSettings(BaseModel):
163
197
  if self.api_key:
164
198
  return self.api_key
165
199
 
200
+ key = os.environ.get(GX_ADMIN_USERNAME)
201
+ if key:
202
+ return key
203
+
166
204
  key = os.environ.get(GX_API_KEY)
167
205
  if key:
168
206
  return key
169
207
 
208
+ key = os.environ.get(GX_ADMIN_API_KEY)
209
+ if key:
210
+ return key
211
+
170
212
  raise Exception(f"you must set a valid GroundX api_key")
groundx/extract/settings/test_settings.py CHANGED
@@ -45,7 +45,6 @@ class TestAgentSettings(unittest.TestCase):
45
45
  "expect": {
46
46
  "api_base": "http://test.com",
47
47
  "api_key": "mykey",
48
- "api_key_env": "myenv",
49
48
  "max_steps": 4,
50
49
  "model_id": "gpt-5",
51
50
  },
@@ -452,10 +451,8 @@ class TestGroundXSettings(unittest.TestCase):
452
451
  def test(self) -> None:
453
452
  tsts: typing.List[typing.Dict[str, typing.Any]] = [
454
453
  {
455
- "api_key_env": "",
456
454
  "expect": {
457
455
  "api_key": Exception,
458
- "api_key_env": "",
459
456
  "base_url": None,
460
457
  "upload_url": "https://upload.eyelevel.ai",
461
458
  },