groundx 2.2.8__py3-none-any.whl → 2.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundx/core/client_wrapper.py +1 -1
- groundx/csv_splitter.py +64 -0
- groundx/ingest.py +121 -77
- {groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/METADATA +1 -1
- {groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/RECORD +7 -6
- {groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/LICENSE +0 -0
- {groundx-2.2.8.dist-info → groundx-2.2.9.dist-info}/WHEEL +0 -0
groundx/core/client_wrapper.py
CHANGED
groundx/csv_splitter.py
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
import csv, math, os, tempfile, typing
|
2
|
+
from pathlib import Path
|
3
|
+
|
4
|
+
|
5
|
+
class CSVSplitter:
    """Split a delimited text file into smaller files that each repeat the header row.

    The number of parts is derived from both the line count (one part per
    started block of 1000 rows) and the on-disk size (one part per started
    MiB); whichever yields more parts wins.

    Attributes are populated eagerly in ``__init__``, so constructing a
    splitter reads the whole file once to count its lines.
    """

    def __init__(self, filepath, delimiter=','):
        """Record the file location and measure its size and row count.

        Args:
            filepath: Path (``str`` or ``os.PathLike``) of the file to split.
            delimiter: Single-character field delimiter passed to the ``csv``
                reader and writer.

        Raises:
            OSError: If the file cannot be stat'ed or opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        self.filepath = filepath
        self.delimiter = delimiter
        self.filename = os.path.basename(filepath)
        self.file_size = os.path.getsize(filepath)
        self.rows_count = self.get_row_count()

    def get_row_count(self):
        """Return the number of physical lines after the header line.

        Lines are counted, not parsed CSV records, so a quoted field that
        contains embedded newlines is counted once per physical line. An
        empty file yields ``0`` rather than a negative count.
        """
        with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
            return max(sum(1 for _ in csvfile) - 1, 0)

    def determine_splits(self):
        """Return how many parts the file should be split into (always >= 1)."""
        row_mod = self.rows_count // 1000 + 1
        file_mod = self.file_size // (1024 * 1024) + 1

        return max(row_mod, file_mod)

    def split(self):
        """Write the file out as several smaller files and return their paths.

        Each part is created inside a fresh temporary directory, is named
        ``<original stem>_<n>.csv`` and starts with the original header row.
        The caller owns the returned files; nothing here deletes them.

        Returns:
            A list of ``Path`` objects. When no split is needed, or the file
            holds no data rows to distribute, the list contains only the
            original file path so the document is never silently dropped.
        """
        splits = self.determine_splits()
        if splits < 2:
            return [Path(self.filepath)]

        # Never let the chunk size reach zero: it is used as a modulus below.
        rows_per_file = max(1, math.ceil(self.rows_count / splits))

        # NOTE(review): parts are always named "*.csv", even when the input is
        # tab-delimited (.tsv) -- confirm downstream handling is suffix-agnostic.
        stem = os.path.splitext(self.filename)[0]

        split_files: typing.List[Path] = []
        temp_dir = None
        current_writer = None
        current_output_file = None

        try:
            with open(self.filepath, "r", newline="", encoding="utf-8") as csvfile:
                reader = csv.reader(csvfile, delimiter=self.delimiter)
                headers = next(reader, None)
                if headers is None:
                    # Nothing to parse at all; hand back the original file.
                    return [Path(self.filepath)]

                for current_row, row in enumerate(reader):
                    if current_row % rows_per_file == 0:
                        # Start of a new chunk: finish the previous part first.
                        if current_output_file:
                            current_output_file.close()
                        if temp_dir is None:
                            # Created lazily so a file without data rows does
                            # not leave an empty temp directory behind.
                            temp_dir = tempfile.mkdtemp()

                        output_file_path = os.path.join(
                            temp_dir,
                            f"{stem}_{len(split_files) + 1}.csv",
                        )
                        split_files.append(Path(output_file_path))
                        current_output_file = open(
                            output_file_path, "w", newline="", encoding="utf-8"
                        )
                        current_writer = csv.writer(
                            current_output_file, delimiter=self.delimiter
                        )
                        current_writer.writerow(headers)

                    current_writer.writerow(row)
        finally:
            # Close the last (or the in-progress, on error) part exactly once.
            if current_output_file:
                current_output_file.close()

        if not split_files:
            # Large file with a header but no data rows: keep it whole.
            return [Path(self.filepath)]

        return split_files
|
groundx/ingest.py
CHANGED
@@ -5,8 +5,8 @@ from urllib.parse import urlparse, urlunparse
|
|
5
5
|
|
6
6
|
from .client import GroundXBase, AsyncGroundXBase
|
7
7
|
from .core.request_options import RequestOptions
|
8
|
+
from .csv_splitter import CSVSplitter
|
8
9
|
from .types.document import Document
|
9
|
-
from .types.document_type import DocumentType
|
10
10
|
from .types.ingest_remote_document import IngestRemoteDocument
|
11
11
|
from .types.ingest_response import IngestResponse
|
12
12
|
|
@@ -38,10 +38,18 @@ MIME_TO_DOCUMENT_TYPE = {v: k for k, v in DOCUMENT_TYPE_TO_MIME.items()}
|
|
38
38
|
|
39
39
|
ALLOWED_SUFFIXES = {f".{k}": v for k, v in DOCUMENT_TYPE_TO_MIME.items()}
|
40
40
|
|
41
|
+
CSV_SPLITS = {
|
42
|
+
".csv": True,
|
43
|
+
}
|
44
|
+
TSV_SPLITS = {
|
45
|
+
".tsv": True,
|
46
|
+
}
|
47
|
+
|
41
48
|
SUFFIX_ALIASES = {
|
42
|
-
".jpeg": "
|
43
|
-
".heic": "
|
44
|
-
".tif": "
|
49
|
+
".jpeg": "jpg",
|
50
|
+
".heic": "heif",
|
51
|
+
".tif": "tiff",
|
52
|
+
".md": "txt",
|
45
53
|
}
|
46
54
|
|
47
55
|
MAX_BATCH_SIZE = 50
|
@@ -115,6 +123,18 @@ def prep_documents(
|
|
115
123
|
return remote_documents, local_documents
|
116
124
|
|
117
125
|
|
126
|
+
def split_doc(file):
    """Return the list of files to upload for a single local path.

    Args:
        file: A ``pathlib.Path``-like object (needs ``is_file()`` and
            ``suffix``).

    Returns:
        An empty list when ``file`` is not a regular file or its lowercased
        suffix is in neither ``ALLOWED_SUFFIXES`` nor ``SUFFIX_ALIASES``.
        For suffixes listed in ``CSV_SPLITS`` / ``TSV_SPLITS`` the result of
        ``CSVSplitter(...).split()`` (comma- or tab-delimited respectively).
        Otherwise a one-element list holding ``file`` itself.
    """
    if not file.is_file():
        return []

    suffix = file.suffix.lower()
    if suffix not in ALLOWED_SUFFIXES and suffix not in SUFFIX_ALIASES:
        return []

    if suffix in CSV_SPLITS:
        return CSVSplitter(filepath=file).split()
    if suffix in TSV_SPLITS:
        return CSVSplitter(filepath=file, delimiter='\t').split()

    # Supported type that needs no splitting: upload as-is.
    return [file]
|
137
|
+
|
118
138
|
class GroundX(GroundXBase):
|
119
139
|
def ingest(
|
120
140
|
self,
|
@@ -163,41 +183,39 @@ class GroundX(GroundXBase):
|
|
163
183
|
"""
|
164
184
|
remote_documents, local_documents = prep_documents(documents)
|
165
185
|
|
166
|
-
if local_documents
|
167
|
-
raise ValueError("Documents must all be either local or remote, not a mix.")
|
168
|
-
|
169
|
-
if len(remote_documents) > 0:
|
170
|
-
if len(remote_documents) > MAX_BATCH_SIZE:
|
171
|
-
raise ValueError("You have sent too many documents in this request")
|
172
|
-
|
173
|
-
return self.documents.ingest_remote(
|
174
|
-
documents=remote_documents,
|
175
|
-
request_options=request_options,
|
176
|
-
)
|
177
|
-
|
178
|
-
if len(local_documents) > MAX_BATCH_SIZE:
|
186
|
+
if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
|
179
187
|
raise ValueError("You have sent too many documents in this request")
|
180
188
|
|
181
|
-
if len(local_documents) == 0:
|
189
|
+
if len(remote_documents) + len(local_documents) == 0:
|
182
190
|
raise ValueError("No valid documents were provided")
|
183
191
|
|
184
|
-
docs: typing.List[IngestRemoteDocument] = []
|
185
192
|
for d in local_documents:
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
193
|
+
splits = split_doc(Path(os.path.expanduser(d.file_path)))
|
194
|
+
|
195
|
+
for sd in splits:
|
196
|
+
url = self._upload_file(upload_api, sd)
|
197
|
+
|
198
|
+
ft = d.file_type
|
199
|
+
if sd.suffix.lower() in SUFFIX_ALIASES:
|
200
|
+
ft = SUFFIX_ALIASES[sd.suffix.lower()]
|
201
|
+
|
202
|
+
fn = sd.name
|
203
|
+
if len(splits) == 1 and d.file_name:
|
204
|
+
fn = d.file_name
|
205
|
+
|
206
|
+
remote_documents.append(
|
207
|
+
IngestRemoteDocument(
|
208
|
+
bucket_id=d.bucket_id,
|
209
|
+
file_name=fn,
|
210
|
+
file_type=ft,
|
211
|
+
process_level=d.process_level,
|
212
|
+
search_data=d.search_data,
|
213
|
+
source_url=url,
|
214
|
+
)
|
196
215
|
)
|
197
|
-
)
|
198
216
|
|
199
217
|
return self.documents.ingest_remote(
|
200
|
-
documents=
|
218
|
+
documents=remote_documents,
|
201
219
|
request_options=request_options,
|
202
220
|
)
|
203
221
|
|
@@ -252,14 +270,10 @@ class GroundX(GroundXBase):
|
|
252
270
|
def load_directory_files(directory: str) -> typing.List[Path]:
|
253
271
|
dir_path = Path(directory)
|
254
272
|
|
255
|
-
matched_files = [
|
256
|
-
|
257
|
-
for
|
258
|
-
|
259
|
-
file.suffix.lower() in ALLOWED_SUFFIXES
|
260
|
-
or file.suffix.lower() in SUFFIX_ALIASES
|
261
|
-
)
|
262
|
-
]
|
273
|
+
matched_files: typing.List[Path] = []
|
274
|
+
for file in dir_path.rglob("*"):
|
275
|
+
for sd in split_doc(file):
|
276
|
+
matched_files.append(sd)
|
263
277
|
|
264
278
|
return matched_files
|
265
279
|
|
@@ -301,6 +315,8 @@ class GroundX(GroundXBase):
|
|
301
315
|
):
|
302
316
|
file_name = os.path.basename(file_path)
|
303
317
|
file_extension = os.path.splitext(file_name)[1][1:].lower()
|
318
|
+
if f".{file_extension}" in SUFFIX_ALIASES:
|
319
|
+
file_extension = SUFFIX_ALIASES[f".{file_extension}"]
|
304
320
|
|
305
321
|
presigned_info = get_presigned_url(endpoint, file_name, file_extension)
|
306
322
|
|
@@ -343,12 +359,23 @@ class GroundX(GroundXBase):
|
|
343
359
|
progress = len(batch)
|
344
360
|
for file in batch:
|
345
361
|
url = self._upload_file(upload_api, file)
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
362
|
+
if file.suffix.lower() in SUFFIX_ALIASES:
|
363
|
+
docs.append(
|
364
|
+
Document(
|
365
|
+
bucket_id=bucket_id,
|
366
|
+
file_name=file.name,
|
367
|
+
file_path=url,
|
368
|
+
file_type=SUFFIX_ALIASES[file.suffix.lower()],
|
369
|
+
),
|
370
|
+
)
|
371
|
+
else:
|
372
|
+
docs.append(
|
373
|
+
Document(
|
374
|
+
bucket_id=bucket_id,
|
375
|
+
file_name=file.name,
|
376
|
+
file_path=url,
|
377
|
+
),
|
378
|
+
)
|
352
379
|
pbar.update(0.25)
|
353
380
|
progress -= 0.25
|
354
381
|
|
@@ -364,11 +391,28 @@ class GroundX(GroundXBase):
|
|
364
391
|
time.sleep(3)
|
365
392
|
ingest = self.documents.get_processing_status_by_id(ingest.ingest.process_id)
|
366
393
|
|
367
|
-
if ingest.ingest.progress
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
394
|
+
if ingest.ingest.progress:
|
395
|
+
if ingest.ingest.progress.processing and ingest.ingest.progress.processing.documents:
|
396
|
+
for doc in ingest.ingest.progress.processing.documents:
|
397
|
+
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
398
|
+
pbar.update(0.75)
|
399
|
+
progress -= 0.75
|
400
|
+
if ingest.ingest.progress.complete and ingest.ingest.progress.complete.documents:
|
401
|
+
for doc in ingest.ingest.progress.complete.documents:
|
402
|
+
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
403
|
+
pbar.update(0.75)
|
404
|
+
progress -= 0.75
|
405
|
+
if ingest.ingest.progress.cancelled and ingest.ingest.progress.cancelled.documents:
|
406
|
+
for doc in ingest.ingest.progress.cancelled.documents:
|
407
|
+
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
408
|
+
pbar.update(0.75)
|
409
|
+
progress -= 0.75
|
410
|
+
if ingest.ingest.progress.errors and ingest.ingest.progress.errors.documents:
|
411
|
+
for doc in ingest.ingest.progress.errors.documents:
|
412
|
+
if doc.status in ["complete", "error", "cancelled"] and doc.document_id not in completed_files:
|
413
|
+
pbar.update(0.75)
|
414
|
+
progress -= 0.75
|
415
|
+
|
372
416
|
|
373
417
|
if ingest.ingest.status in ["error", "cancelled"]:
|
374
418
|
raise ValueError(f"Ingest failed with status: {ingest.ingest.status}")
|
@@ -431,41 +475,39 @@ class AsyncGroundX(AsyncGroundXBase):
|
|
431
475
|
"""
|
432
476
|
remote_documents, local_documents = prep_documents(documents)
|
433
477
|
|
434
|
-
if local_documents
|
435
|
-
raise ValueError("Documents must all be either local or remote, not a mix.")
|
436
|
-
|
437
|
-
if len(remote_documents) > 0:
|
438
|
-
if len(remote_documents) > MAX_BATCH_SIZE:
|
439
|
-
raise ValueError("You have sent too many documents in this request")
|
440
|
-
|
441
|
-
return await self.documents.ingest_remote(
|
442
|
-
documents=remote_documents,
|
443
|
-
request_options=request_options,
|
444
|
-
)
|
445
|
-
|
446
|
-
if len(local_documents) > MAX_BATCH_SIZE:
|
478
|
+
if len(remote_documents) + len(local_documents) > MAX_BATCH_SIZE:
|
447
479
|
raise ValueError("You have sent too many documents in this request")
|
448
480
|
|
449
|
-
if len(local_documents) == 0:
|
481
|
+
if len(remote_documents) + len(local_documents) == 0:
|
450
482
|
raise ValueError("No valid documents were provided")
|
451
483
|
|
452
|
-
docs: typing.List[IngestRemoteDocument] = []
|
453
484
|
for d in local_documents:
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
485
|
+
splits = split_doc(Path(os.path.expanduser(d.file_path)))
|
486
|
+
|
487
|
+
for sd in splits:
|
488
|
+
url = self._upload_file(upload_api, sd)
|
489
|
+
|
490
|
+
ft = d.file_type
|
491
|
+
if sd.suffix.lower() in SUFFIX_ALIASES:
|
492
|
+
ft = SUFFIX_ALIASES[sd.suffix.lower()]
|
493
|
+
|
494
|
+
fn = sd.name
|
495
|
+
if len(splits) == 1 and d.file_name:
|
496
|
+
fn = d.file_name
|
497
|
+
|
498
|
+
remote_documents.append(
|
499
|
+
IngestRemoteDocument(
|
500
|
+
bucket_id=d.bucket_id,
|
501
|
+
file_name=fn,
|
502
|
+
file_type=ft,
|
503
|
+
process_level=d.process_level,
|
504
|
+
search_data=d.search_data,
|
505
|
+
source_url=url,
|
506
|
+
)
|
464
507
|
)
|
465
|
-
)
|
466
508
|
|
467
509
|
return await self.documents.ingest_remote(
|
468
|
-
documents=
|
510
|
+
documents=remote_documents,
|
469
511
|
request_options=request_options,
|
470
512
|
)
|
471
513
|
|
@@ -476,6 +518,8 @@ class AsyncGroundX(AsyncGroundXBase):
|
|
476
518
|
):
|
477
519
|
file_name = os.path.basename(file_path)
|
478
520
|
file_extension = os.path.splitext(file_name)[1][1:].lower()
|
521
|
+
if f".{file_extension}" in SUFFIX_ALIASES:
|
522
|
+
file_extension = SUFFIX_ALIASES[f".{file_extension}"]
|
479
523
|
|
480
524
|
presigned_info = get_presigned_url(endpoint, file_name, file_extension)
|
481
525
|
|
@@ -4,7 +4,7 @@ groundx/buckets/client.py,sha256=4jlc9vfIult1mMJ4FZW4_KFJybZPStZt1FUplIgrxbU,239
|
|
4
4
|
groundx/client.py,sha256=dIW9OyrMyfC1N7HSxRrHh0w_8rJ8osNUOPdYD6ueQ6g,6515
|
5
5
|
groundx/core/__init__.py,sha256=SQ85PF84B9MuKnBwHNHWemSGuy-g_515gFYNFhvEE0I,1438
|
6
6
|
groundx/core/api_error.py,sha256=RE8LELok2QCjABadECTvtDp7qejA1VmINCh6TbqPwSE,426
|
7
|
-
groundx/core/client_wrapper.py,sha256=
|
7
|
+
groundx/core/client_wrapper.py,sha256=D6uZpUYxYzmgxNNCTN7quiFvNlBQmPOyLstnrXfcJcs,1802
|
8
8
|
groundx/core/datetime_utils.py,sha256=nBys2IsYrhPdszxGKCNRPSOCwa-5DWOHG95FB8G9PKo,1047
|
9
9
|
groundx/core/file.py,sha256=d4NNbX8XvXP32z8KpK2Xovv33nFfruIrpz0QWxlgpZk,2663
|
10
10
|
groundx/core/http_client.py,sha256=Z77OIxIbL4OAB2IDqjRq_sYa5yNYAWfmdhdCSSvh6Y4,19552
|
@@ -14,6 +14,7 @@ groundx/core/query_encoder.py,sha256=ekulqNd0j8TgD7ox-Qbz7liqX8-KP9blvT9DsRCenYM
|
|
14
14
|
groundx/core/remove_none_from_dict.py,sha256=EU9SGgYidWq7SexuJbNs4-PZ-5Bl3Vppd864mS6vQZw,342
|
15
15
|
groundx/core/request_options.py,sha256=h0QUNCFVdCW_7GclVySCAY2w4NhtXVBUCmHgmzaxpcg,1681
|
16
16
|
groundx/core/serialization.py,sha256=D9h_t-RQON3-CHWs1C4ESY9B-Yd5d-l5lnTLb_X896g,9601
|
17
|
+
groundx/csv_splitter.py,sha256=6HGXdDpwBX_IJaCbla1WuirJERBTvjLzBf9OBtwGFWU,2254
|
17
18
|
groundx/customer/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
|
18
19
|
groundx/customer/client.py,sha256=C_JANeDewRD1Kg-q7LPxdiOSWbYSTOiYlBYZLRYPI44,3467
|
19
20
|
groundx/documents/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
|
@@ -26,7 +27,7 @@ groundx/groups/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
|
|
26
27
|
groundx/groups/client.py,sha256=bytQRh9m7e4vIuYHb7dD1kCTQZvyBxedCqGnmmLqrsI,35237
|
27
28
|
groundx/health/__init__.py,sha256=FTtvy8EDg9nNNg9WCatVgKTRYV8-_v1roeGPAKoa_pw,65
|
28
29
|
groundx/health/client.py,sha256=fcTa21RWPyBuT77PQ0EncC6rBaW_DrYlRvudy9-0H58,7545
|
29
|
-
groundx/ingest.py,sha256=
|
30
|
+
groundx/ingest.py,sha256=LtnUGcgtE1MNYL3PGFrzPqRMnLeOxr-fVsZ3fmTAUKI,18294
|
30
31
|
groundx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
32
|
groundx/search/__init__.py,sha256=RagVzjShP33mDg9o4N3kGzV0egL1RYNjCpXPE8VzMYE,145
|
32
33
|
groundx/search/client.py,sha256=zrrqFy0HowDUYPsMU4nfvDV2RgmkEQ4E8WYNktu3xcs,18684
|
@@ -81,7 +82,7 @@ groundx/types/subscription_detail.py,sha256=WNfUw2EMVECIvNYcV2s51zZ6T3Utc4zYXw63
|
|
81
82
|
groundx/types/subscription_detail_meters.py,sha256=lBa8-1QlMVHjr5RLGqhiTKnD1KMM0AAHTWvz9TVtG8w,830
|
82
83
|
groundx/types/website_source.py,sha256=3WeRCiilNKKBTfhwgjo3jbcVI3vLTeM-KxI6dVzpg9o,1578
|
83
84
|
groundx/version.py,sha256=1yVogKaq260fQfckM2RYN2144SEw0QROsZW8ICtkG4U,74
|
84
|
-
groundx-2.2.
|
85
|
-
groundx-2.2.
|
86
|
-
groundx-2.2.
|
87
|
-
groundx-2.2.
|
85
|
+
groundx-2.2.9.dist-info/LICENSE,sha256=dFE6nY1bHnSn6NqmdlghlU1gQqLqYNphrceGVehSa7o,1065
|
86
|
+
groundx-2.2.9.dist-info/METADATA,sha256=1BWmC2-Lx8AT1vEY0juSK4ugjxyn9W5ndJy0bomFEwQ,5173
|
87
|
+
groundx-2.2.9.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
|
88
|
+
groundx-2.2.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|