groundx-2.2.8-py3-none-any.whl → groundx-2.2.9-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
groundx/core/client_wrapper.py CHANGED
@@ -16,7 +16,7 @@ class BaseClientWrapper:
          headers: typing.Dict[str, str] = {
              "X-Fern-Language": "Python",
              "X-Fern-SDK-Name": "groundx",
-             "X-Fern-SDK-Version": "2.2.8",
+             "X-Fern-SDK-Version": "2.2.9",
          }
          headers["X-API-Key"] = self.api_key
          return headers
groundx/csv_splitter.py ADDED
@@ -0,0 +1,64 @@
+ import csv, math, os, tempfile, typing
+ from pathlib import Path
+
+
+ class CSVSplitter:
+     def __init__(self, filepath, delimiter=','):
+         self.filepath = filepath
+         self.delimiter = delimiter
+         self.filename = os.path.basename(filepath)
+         self.file_size = os.path.getsize(filepath)
+         self.rows_count = self.get_row_count()
+
+     def get_row_count(self):
+         with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
+             return sum(1 for _ in csvfile) - 1
+
+     def determine_splits(self):
+         row_mod = int(self.rows_count / 1000) + 1
+         file_mod = int(self.file_size / 1024 / 1024) + 1
+
+         return max(row_mod, file_mod)
+
+     def split(self):
+         splits = self.determine_splits()
+         if splits < 2:
+             return [Path(self.filepath)]
+
+         rows_per_file = math.ceil(self.rows_count / splits)
+
+         split_files: typing.List[Path] = []
+         with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
+             reader = csv.reader(csvfile, delimiter=self.delimiter)
+             headers = next(reader)
+
+             temp_dir = tempfile.mkdtemp()
+
+             current_file_number = 1
+             current_row = 0
+             current_writer = None
+             current_output_file = None
+
+             for row in reader:
+                 if current_row % rows_per_file == 0:
+                     if current_output_file:
+                         current_output_file.close()
+                     output_file_path = os.path.join(
+                         temp_dir,
+                         f"{os.path.splitext(self.filename)[0]}_{current_file_number}.csv",
+                     )
+                     split_files.append(Path(output_file_path))
+                     current_output_file = open(
+                         output_file_path, "w", newline="", encoding="utf-8"
+                     )
+                     current_writer = csv.writer(current_output_file, delimiter=self.delimiter)
+                     current_writer.writerow(headers)
+                     current_file_number += 1
+
+                 current_writer.writerow(row)
+                 current_row += 1
+
+             if current_output_file:
+                 current_output_file.close()
+
+         return split_files
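The new CSVSplitter keeps splits near 1,000 rows and 1 MB each: determine_splits() takes the larger of the row count divided by 1,000 and the file size in MB (each rounded up), and split() writes that many chunks into a fresh temporary directory, repeating the header row in every chunk. Note that get_row_count() counts physical lines rather than parsed CSV records, so quoted multiline fields inflate the count slightly. A minimal usage sketch (the input paths are hypothetical):

    from groundx.csv_splitter import CSVSplitter

    # A file under ~1,000 data rows and ~1 MB comes back as a single path, unsplit.
    parts = CSVSplitter(filepath="exports/orders.csv").split()
    for part in parts:
        print(part)  # e.g. /tmp/tmpab12cd/orders_1.csv, orders_2.csv, ...

    # TSV input reuses the same splitter with a tab delimiter.
    tsv_parts = CSVSplitter(filepath="exports/orders.tsv", delimiter="\t").split()

The chunks land in a tempfile.mkdtemp() directory; nothing in this diff removes them after upload, so long-running callers may want to clean up afterward.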
groundx/ingest.py CHANGED
@@ -5,8 +5,8 @@ from urllib.parse import urlparse, urlunparse
  
  from .client import GroundXBase, AsyncGroundXBase
  from .core.request_options import RequestOptions
+ from .csv_splitter import CSVSplitter
  from .types.document import Document
- from .types.document_type import DocumentType
  from .types.ingest_remote_document import IngestRemoteDocument
  from .types.ingest_response import IngestResponse
  
@@ -38,10 +38,18 @@ MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
  
  ALLOWED_SUFFIXES = {f".{k}": v for k, v in DOCUMENT_TYPE_TO_MIME.items()}
  
+ CSV_SPLITS = {
+     ".csv": True,
+ }
+ TSV_SPLITS = {
+     ".tsv": True,
+ }
+
  SUFFIX_ALIASES = {
-     ".jpeg": ".jpg",
-     ".heic": ".heif",
-     ".tif": ".tiff",
+     ".jpeg": "jpg",
+     ".heic": "heif",
+     ".tif": "tiff",
+     ".md": "txt",
  }
  
  MAX_BATCH_SIZE = 50
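Two things change in these tables. CSV_SPLITS and TSV_SPLITS mark which suffixes get split client-side before upload, and SUFFIX_ALIASES now maps a suffix to a bare file-type string instead of another suffix, because 2.2.9 feeds the value straight into file_type fields and presigned-URL extensions, both of which are dot-free. A small sketch restating the new semantics from the diff:

    SUFFIX_ALIASES = {".jpeg": "jpg", ".heic": "heif", ".tif": "tiff", ".md": "txt"}

    # 2.2.8: ".jpeg" -> ".jpg"  (suffix to suffix)
    # 2.2.9: ".jpeg" -> "jpg"   (suffix to file type)
    assert SUFFIX_ALIASES[".md"] == "txt"  # Markdown is newly ingested as plain text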
@@ -115,6 +123,18 @@ def prep_documents(
      return remote_documents, local_documents
  
  
+ def split_doc(file):
+     if file.is_file() and (
+         file.suffix.lower() in ALLOWED_SUFFIXES
+         or file.suffix.lower() in SUFFIX_ALIASES
+     ):
+         if file.suffix.lower() in CSV_SPLITS:
+             return CSVSplitter(filepath=file).split()
+         elif file.suffix.lower() in TSV_SPLITS:
+             return CSVSplitter(filepath=file, delimiter='\t').split()
+         return [file]
+     return []
+
  class GroundX(GroundXBase):
      def ingest(
          self,
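split_doc is the new gatekeeper for every local path: unsupported suffixes yield an empty list, CSV/TSV files come back as one or more chunk paths, and any other supported file passes through untouched. A sketch of the three outcomes (paths are hypothetical, and it assumes .pdf appears in DOCUMENT_TYPE_TO_MIME while .zip does not):

    from pathlib import Path

    split_doc(Path("report.pdf"))   # -> [Path("report.pdf")]             supported, not split
    split_doc(Path("orders.csv"))   # -> [Path(".../orders_1.csv"), ...]  split into chunks
    split_doc(Path("bundle.zip"))   # -> []                               suffix not recognized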
@@ -163,41 +183,39 @@ class GroundX(GroundXBase):
          """
          remote_documents, local_documents = prep_documents(documents)
  
-         if local_documents and remote_documents:
-             raise ValueError("Documents must all be either local or remote, not a mix.")
-
-         if len(remote_documents) > 0:
-             if len(remote_documents) > MAX_BATCH_SIZE:
-                 raise ValueError("You have sent too many documents in this request")
-
-             return self.documents.ingest_remote(
-                 documents=remote_documents,
-                 request_options=request_options,
-             )
-
-         if len(local_documents) > MAX_BATCH_SIZE:
+         if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
              raise ValueError("You have sent too many documents in this request")
  
-         if len(local_documents) == 0:
+         if len(remote_documents) + len(local_documents) == 0:
              raise ValueError("No valid documents were provided")
  
-         docs: typing.List[IngestRemoteDocument] = []
          for d in local_documents:
-             url = self._upload_file(upload_api, Path(os.path.expanduser(d.file_path)))
-
-             docs.append(
-                 IngestRemoteDocument(
-                     bucket_id=d.bucket_id,
-                     file_name=d.file_name,
-                     file_type=d.file_type,
-                     process_level=d.process_level,
-                     search_data=d.search_data,
-                     source_url=url,
+             splits = split_doc(Path(os.path.expanduser(d.file_path)))
+
+             for sd in splits:
+                 url = self._upload_file(upload_api, sd)
+
+                 ft = d.file_type
+                 if sd.suffix.lower() in SUFFIX_ALIASES:
+                     ft = SUFFIX_ALIASES[sd.suffix.lower()]
+
+                 fn = sd.name
+                 if len(splits) == 1 and d.file_name:
+                     fn = d.file_name
+
+                 remote_documents.append(
+                     IngestRemoteDocument(
+                         bucket_id=d.bucket_id,
+                         file_name=fn,
+                         file_type=ft,
+                         process_level=d.process_level,
+                         search_data=d.search_data,
+                         source_url=url,
+                     )
                  )
-             )
  
          return self.documents.ingest_remote(
-             documents=docs,
+             documents=remote_documents,
              request_options=request_options,
          )
  
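The 2.2.8 rule that a batch be all-local or all-remote is gone: local files are split, uploaded, and appended to remote_documents as IngestRemoteDocument entries, then everything goes out in one ingest_remote call. Split chunks keep their generated names (orders_1.csv, ...); only a single-chunk upload honors a caller-supplied file_name. A hedged usage sketch, assuming GroundX and Document are exported at the package root as in other Fern-generated SDKs:

    from groundx import Document, GroundX

    client = GroundX(api_key="YOUR_API_KEY")

    # 2.2.8 raised "Documents must all be either local or remote, not a mix."
    response = client.ingest(
        documents=[
            Document(bucket_id=1234, file_path="https://example.com/remote.pdf"),
            Document(bucket_id=1234, file_path="~/data/orders.csv"),  # split client-side
        ]
    )

Note that the combined MAX_BATCH_SIZE check runs before splitting, so a CSV that splits into many chunks can still push the final ingest_remote call past 50 documents.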
@@ -252,14 +270,10 @@
  def load_directory_files(directory: str) -> typing.List[Path]:
      dir_path = Path(directory)
  
-     matched_files = [
-         file
-         for file in dir_path.rglob("*")
-         if file.is_file() and (
-             file.suffix.lower() in ALLOWED_SUFFIXES
-             or file.suffix.lower() in SUFFIX_ALIASES
-         )
-     ]
+     matched_files: typing.List[Path] = []
+     for file in dir_path.rglob("*"):
+         for sd in split_doc(file):
+             matched_files.append(sd)
  
      return matched_files
  
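Directory walks now reuse split_doc instead of duplicating the suffix filter, so the returned list can be longer than the number of files on disk. For a hypothetical directory holding report.pdf, orders.csv (splitting into two chunks), and bundle.zip, under the same suffix assumptions as above:

    files = load_directory_files("/data/inbox")
    # -> [report.pdf, orders_1.csv, orders_2.csv]   bundle.zip is dropped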
@@ -301,6 +315,8 @@
      ):
          file_name = os.path.basename(file_path)
          file_extension = os.path.splitext(file_name)[1][1:].lower()
+         if f".{file_extension}" in SUFFIX_ALIASES:
+             file_extension = SUFFIX_ALIASES[f".{file_extension}"]
  
          presigned_info = get_presigned_url(endpoint, file_name, file_extension)
  
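The upload path now normalizes aliased extensions before requesting a presigned URL, which is why the alias values lost their dots. A worked trace of the two added lines (standalone, with the mapping inlined):

    import os

    SUFFIX_ALIASES = {".jpeg": "jpg", ".heic": "heif", ".tif": "tiff", ".md": "txt"}

    file_name = "photo.JPEG"
    file_extension = os.path.splitext(file_name)[1][1:].lower()  # "jpeg"
    if f".{file_extension}" in SUFFIX_ALIASES:
        file_extension = SUFFIX_ALIASES[f".{file_extension}"]    # -> "jpg"
    print(file_name, file_extension)  # photo.JPEG jpg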
@@ -343,12 +359,23 @@
              progress = len(batch)
              for file in batch:
                  url = self._upload_file(upload_api, file)
-                 docs.append(
-                     Document(
-                         bucket_id=bucket_id,
-                         file_path=url,
-                     ),
-                 )
+                 if file.suffix.lower() in SUFFIX_ALIASES:
+                     docs.append(
+                         Document(
+                             bucket_id=bucket_id,
+                             file_name=file.name,
+                             file_path=url,
+                             file_type=SUFFIX_ALIASES[file.suffix.lower()],
+                         ),
+                     )
+                 else:
+                     docs.append(
+                         Document(
+                             bucket_id=bucket_id,
+                             file_name=file.name,
+                             file_path=url,
+                         ),
+                     )
                  pbar.update(0.25)
                  progress -= 0.25
  
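Both branches now pass file_name=file.name, so uploaded documents keep their on-disk names rather than whatever the presigned upload URL yields, and aliased suffixes additionally pin file_type. A hedged condensation of the same logic (the shipped code spells out both docs.append calls):

    kwargs = dict(bucket_id=bucket_id, file_name=file.name, file_path=url)
    if file.suffix.lower() in SUFFIX_ALIASES:
        kwargs["file_type"] = SUFFIX_ALIASES[file.suffix.lower()]
    docs.append(Document(**kwargs))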
@@ -364,11 +391,28 @@
                  time.sleep(3)
                  ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
  
-                 if ingest.ingest.progress and ingest.ingest.progress.processing:
-                     for doc in ingest.ingest.progress.processing.documents:
-                         if doc.status == "complete" and doc.document_id not in completed_files:
-                             pbar.update(0.75)
-                             progress -= 0.75
+                 if ingest.ingest.progress:
+                     if ingest.ingest.progress.processing and ingest.ingest.progress.processing.documents:
+                         for doc in ingest.ingest.progress.processing.documents:
+                             if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                 pbar.update(0.75)
+                                 progress -= 0.75
+                     if ingest.ingest.progress.complete and ingest.ingest.progress.complete.documents:
+                         for doc in ingest.ingest.progress.complete.documents:
+                             if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                 pbar.update(0.75)
+                                 progress -= 0.75
+                     if ingest.ingest.progress.cancelled and ingest.ingest.progress.cancelled.documents:
+                         for doc in ingest.ingest.progress.cancelled.documents:
+                             if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                 pbar.update(0.75)
+                                 progress -= 0.75
+                     if ingest.ingest.progress.errors and ingest.ingest.progress.errors.documents:
+                         for doc in ingest.ingest.progress.errors.documents:
+                             if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
+                                 pbar.update(0.75)
+                                 progress -= 0.75
+
  
                  if ingest.ingest.status in ["error", "cancelled"]:
                      raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
@@ -431,41 +475,39 @@ class AsyncGroundX(AsyncGroundXBase):
          """
          remote_documents, local_documents = prep_documents(documents)
  
-         if local_documents and remote_documents:
-             raise ValueError("Documents must all be either local or remote, not a mix.")
-
-         if len(remote_documents) > 0:
-             if len(remote_documents) > MAX_BATCH_SIZE:
-                 raise ValueError("You have sent too many documents in this request")
-
-             return await self.documents.ingest_remote(
-                 documents=remote_documents,
-                 request_options=request_options,
-             )
-
-         if len(local_documents) > MAX_BATCH_SIZE:
+         if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
              raise ValueError("You have sent too many documents in this request")
  
-         if len(local_documents) == 0:
+         if len(remote_documents) + len(local_documents) == 0:
              raise ValueError("No valid documents were provided")
  
-         docs: typing.List[IngestRemoteDocument] = []
          for d in local_documents:
-             url = self._upload_file(upload_api, Path(os.path.expanduser(d.file_path)))
-
-             docs.append(
-                 IngestRemoteDocument(
-                     bucket_id=d.bucket_id,
-                     file_name=d.file_name,
-                     file_type=d.file_type,
-                     process_level=d.process_level,
-                     search_data=d.search_data,
-                     source_url=url,
+             splits = split_doc(Path(os.path.expanduser(d.file_path)))
+
+             for sd in splits:
+                 url = self._upload_file(upload_api, sd)
+
+                 ft = d.file_type
+                 if sd.suffix.lower() in SUFFIX_ALIASES:
+                     ft = SUFFIX_ALIASES[sd.suffix.lower()]
+
+                 fn = sd.name
+                 if len(splits) == 1 and d.file_name:
+                     fn = d.file_name
+
+                 remote_documents.append(
+                     IngestRemoteDocument(
+                         bucket_id=d.bucket_id,
+                         file_name=fn,
+                         file_type=ft,
+                         process_level=d.process_level,
+                         search_data=d.search_data,
+                         source_url=url,
+                     )
                  )
-             )
  
          return await self.documents.ingest_remote(
-             documents=docs,
+             documents=remote_documents,
              request_options=request_options,
          )
  
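The async variant mirrors the synchronous rewrite line for line; note that _upload_file is still called without await, exactly as shown here, so uploads block the event loop. A hedged usage sketch under the same export assumptions as above:

    import asyncio

    from groundx import AsyncGroundX, Document

    async def main() -> None:
        client = AsyncGroundX(api_key="YOUR_API_KEY")
        response = await client.ingest(
            documents=[Document(bucket_id=1234, file_path="~/data/orders.csv")]
        )
        print(response.ingest.process_id)

    asyncio.run(main())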
@@ -476,6 +518,8 @@ class AsyncGroundX(AsyncGroundXBase):
      ):
          file_name = os.path.basename(file_path)
          file_extension = os.path.splitext(file_name)[1][1:].lower()
+         if f".{file_extension}" in SUFFIX_ALIASES:
+             file_extension = SUFFIX_ALIASES[f".{file_extension}"]
  
          presigned_info = get_presigned_url(endpoint, file_name, file_extension)
  
groundx-2.2.8.dist-info/METADATA → groundx-2.2.9.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: groundx
- Version: 2.2.8
+ Version: 2.2.9
  Summary:
  License: MIT
  Requires-Python: >=3.8,<4.0
groundx-2.2.8.dist-info/RECORD → groundx-2.2.9.dist-info/RECORD
@@ -4,7 +4,7 @@ groundx/buckets/client.py,sha256=4jlc9vfIult1mMJ4FZW4_KFJybZPStZt1FUplIgrxbU,239
  groundx/client.py,sha256=dIW9OyrMyfC1N7HSxRrHh0w_8rJ8osNUOPdYD6ueQ6g,6515
  groundx/core/__init__.py,sha256=SQ85PF84B9MuKnBwHNHWemSGuy-g_515gFYNFhvEE0I,1438
  groundx/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
- groundx/core/client_wrapper.py,sha256=DckEncGF_W3v8kguKAXWcn2um2B9k-nkWvKmH0HoyGQ,1802
+ groundx/core/client_wrapper.py,sha256=D6uZpUYxYzmgxNNCTN7quiFvNlBQmPOyLstnrXfcJcs,1802
  groundx/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
  groundx/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
  groundx/core/http_client.py,sha256=Z77OIxIbL4OAB2IDqjRq_sYa5yNYAWfmdhdCSSvh6Y4,19552
@@ -14,6 +14,7 @@ groundx/core/query_encoder.py,sha256=ekulqNd0j8TgD7ox-Qbz7liqX8-KP9blvT9DsRCenYM
  groundx/core/remove_none_from_dict.py,sha256=EU9SGgYidWq7SexuJbNs4-PZ-5Bl3Vppd864mS6vQZw,342
  groundx/core/request_options.py,sha256=h0QUNCFVdCW_7GclVySCAY2w4NhtXVBUCmHgmzaxpcg,1681
  groundx/core/serialization.py,sha256=D9h_t-RQON3-CHWs1C4ESY9B-Yd5d-l5lnTLb_X896g,9601
+ groundx/csv_splitter.py,sha256=6HGXdDpwBX_IJaCbla1WuirJERBTvjLzBf9OBtwGFWU,2254
  groundx/customer/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
  groundx/customer/client.py,sha256=C_JANeDewRD1Kg-q7LPxdiOSWbYSTOiYlBYZLRYPI44,3467
  groundx/documents/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
@@ -26,7 +27,7 @@ groundx/groups/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
  groundx/groups/client.py,sha256=bytQRh9m7e4vIuYHb7dD1kCTQZvyBxedCqGnmmLqrsI,35237
  groundx/health/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
  groundx/health/client.py,sha256=fcTa21RWPyBuT77PQ0EncC6rBaW_DrYlRvudy9-0H58,7545
- groundx/ingest.py,sha256=mQB__GQmIDP6W5jGhmjORXh2o6HHTseHiahZz2Es-tM,16119
+ groundx/ingest.py,sha256=LtnUGcgtE1MNYL3PGFrzPqRMnLeOxr-fVsZ3fmTAUKI,18294
  groundx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  groundx/search/__init__.py,sha256=RagVzjShP33mDg9o4N3kGzV0egL1RYNjCpXPE8VzMYE,145
  groundx/search/client.py,sha256=zrrqFy0HowDUYPsMU4nfvDV2RgmkEQ4E8WYNktu3xcs,18684
@@ -81,7 +82,7 @@ groundx/types/subscription_detail.py,sha256=WNfUw2EMVECIvNYcV2s51zZ6T3Utc4zYXw63
  groundx/types/subscription_detail_meters.py,sha256=lBa8-1QlMVHjr5RLGqhiTKnD1KMM0AAHTWvz9TVtG8w,830
  groundx/types/website_source.py,sha256=3WeRCiilNKKBTfhwgjo3jbcVI3vLTeM-KxI6dVzpg9o,1578
  groundx/version.py,sha256=1yVogKaq260fQfckM2RYN2144SEw0QROsZW8ICtkG4U,74
- groundx-2.2.8.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
- groundx-2.2.8.dist-info/METADATA,sha256=Q67rLMlsO72-NH2qvK8Bw4ciSTDS5C4LNoPiBk8UWyI,5173
- groundx-2.2.8.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
- groundx-2.2.8.dist-info/RECORD,,
+ groundx-2.2.9.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
+ groundx-2.2.9.dist-info/METADATA,sha256=1BWmC2-Lx8AT1vEY0juSK4ugjxyn9W5ndJy0bomFEwQ,5173
+ groundx-2.2.9.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
+ groundx-2.2.9.dist-info/RECORD,,