airbyte-cdk 6.6.1__py3-none-any.whl → 6.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +69 -19
- {airbyte_cdk-6.6.1.dist-info → airbyte_cdk-6.6.2.dist-info}/METADATA +3 -4
- {airbyte_cdk-6.6.1.dist-info → airbyte_cdk-6.6.2.dist-info}/RECORD +5 -5
- {airbyte_cdk-6.6.1.dist-info → airbyte_cdk-6.6.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.6.1.dist-info → airbyte_cdk-6.6.2.dist-info}/WHEEL +0 -0
@@ -29,16 +29,25 @@ from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
|
|
29
29
|
from airbyte_cdk.utils import is_cloud_environment
|
30
30
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
31
31
|
from unstructured.file_utils.filetype import (
|
32
|
+
EXT_TO_FILETYPE,
|
32
33
|
FILETYPE_TO_MIMETYPE,
|
33
34
|
STR_TO_FILETYPE,
|
34
35
|
FileType,
|
35
36
|
detect_filetype,
|
36
37
|
)
|
38
|
+
import nltk
|
37
39
|
|
38
40
|
unstructured_partition_pdf = None
|
39
41
|
unstructured_partition_docx = None
|
40
42
|
unstructured_partition_pptx = None
|
41
43
|
|
44
|
+
try:
|
45
|
+
nltk.data.find("tokenizers/punkt.zip")
|
46
|
+
nltk.data.find("tokenizers/punkt_tab.zip")
|
47
|
+
except LookupError:
|
48
|
+
nltk.download("punkt")
|
49
|
+
nltk.download("punkt_tab")
|
50
|
+
|
42
51
|
|
43
52
|
def optional_decode(contents: Union[str, bytes]) -> str:
|
44
53
|
if isinstance(contents, bytes):
|
@@ -108,9 +117,11 @@ class UnstructuredParser(FileTypeParser):
|
|
108
117
|
format = _extract_format(config)
|
109
118
|
with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
|
110
119
|
filetype = self._get_filetype(file_handle, file)
|
111
|
-
|
112
120
|
if filetype not in self._supported_file_types() and not format.skip_unprocessable_files:
|
113
|
-
raise self._create_parse_error(
|
121
|
+
raise self._create_parse_error(
|
122
|
+
file,
|
123
|
+
self._get_file_type_error_message(filetype),
|
124
|
+
)
|
114
125
|
|
115
126
|
return {
|
116
127
|
"content": {
|
@@ -159,6 +170,10 @@ class UnstructuredParser(FileTypeParser):
|
|
159
170
|
logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
|
160
171
|
else:
|
161
172
|
raise e
|
173
|
+
except Exception as e:
|
174
|
+
exception_str = str(e)
|
175
|
+
logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
|
176
|
+
raise e
|
162
177
|
|
163
178
|
def _read_file(
|
164
179
|
self,
|
@@ -176,20 +191,32 @@ class UnstructuredParser(FileTypeParser):
|
|
176
191
|
# check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
|
177
192
|
raise Exception("unstructured library is not available")
|
178
193
|
|
179
|
-
filetype = self._get_filetype(file_handle, remote_file)
|
194
|
+
filetype: FileType | None = self._get_filetype(file_handle, remote_file)
|
180
195
|
|
181
|
-
if filetype
|
196
|
+
if filetype is None or filetype not in self._supported_file_types():
|
197
|
+
raise self._create_parse_error(
|
198
|
+
remote_file,
|
199
|
+
self._get_file_type_error_message(filetype),
|
200
|
+
)
|
201
|
+
if filetype in {FileType.MD, FileType.TXT}:
|
182
202
|
file_content: bytes = file_handle.read()
|
183
203
|
decoded_content: str = optional_decode(file_content)
|
184
204
|
return decoded_content
|
185
|
-
if filetype not in self._supported_file_types():
|
186
|
-
raise self._create_parse_error(remote_file, self._get_file_type_error_message(filetype))
|
187
205
|
if format.processing.mode == "local":
|
188
|
-
return self._read_file_locally(
|
206
|
+
return self._read_file_locally(
|
207
|
+
file_handle,
|
208
|
+
filetype,
|
209
|
+
format.strategy,
|
210
|
+
remote_file,
|
211
|
+
)
|
189
212
|
elif format.processing.mode == "api":
|
190
213
|
try:
|
191
214
|
result: str = self._read_file_remotely_with_retries(
|
192
|
-
file_handle,
|
215
|
+
file_handle,
|
216
|
+
format.processing,
|
217
|
+
filetype,
|
218
|
+
format.strategy,
|
219
|
+
remote_file,
|
193
220
|
)
|
194
221
|
except Exception as e:
|
195
222
|
# If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
|
@@ -336,7 +363,11 @@ class UnstructuredParser(FileTypeParser):
|
|
336
363
|
|
337
364
|
return self._render_markdown([element.to_dict() for element in elements])
|
338
365
|
|
339
|
-
def _create_parse_error(
|
366
|
+
def _create_parse_error(
|
367
|
+
self,
|
368
|
+
remote_file: RemoteFile,
|
369
|
+
message: str,
|
370
|
+
) -> RecordParseError:
|
340
371
|
return RecordParseError(
|
341
372
|
FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
|
342
373
|
)
|
@@ -360,32 +391,51 @@ class UnstructuredParser(FileTypeParser):
|
|
360
391
|
# detect_filetype is either using the file name or file content
|
361
392
|
# if possible, try to leverage the file name to detect the file type
|
362
393
|
# if the file name is not available, use the file content
|
363
|
-
file_type =
|
364
|
-
|
365
|
-
|
366
|
-
|
394
|
+
file_type: FileType | None = None
|
395
|
+
try:
|
396
|
+
file_type = detect_filetype(
|
397
|
+
filename=remote_file.uri,
|
398
|
+
)
|
399
|
+
except Exception:
|
400
|
+
# Path doesn't exist locally. Try something else...
|
401
|
+
pass
|
402
|
+
|
403
|
+
if file_type and file_type != FileType.UNK:
|
367
404
|
return file_type
|
368
405
|
|
369
406
|
type_based_on_content = detect_filetype(file=file)
|
407
|
+
file.seek(0) # detect_filetype is reading to read the file content, so we need to reset
|
370
408
|
|
371
|
-
|
372
|
-
|
409
|
+
if type_based_on_content and type_based_on_content != FileType.UNK:
|
410
|
+
return type_based_on_content
|
373
411
|
|
374
|
-
|
412
|
+
extension = "." + remote_file.uri.split(".")[-1].lower()
|
413
|
+
if extension in EXT_TO_FILETYPE:
|
414
|
+
return EXT_TO_FILETYPE[extension]
|
415
|
+
|
416
|
+
return None
|
375
417
|
|
376
418
|
def _supported_file_types(self) -> List[Any]:
|
377
419
|
return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
|
378
420
|
|
379
|
-
def _get_file_type_error_message(
|
421
|
+
def _get_file_type_error_message(
|
422
|
+
self,
|
423
|
+
file_type: FileType | None,
|
424
|
+
) -> str:
|
380
425
|
supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
|
381
|
-
return f"File type {file_type} is not supported. Supported file types are {supported_file_types}"
|
426
|
+
return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
|
382
427
|
|
383
428
|
def _render_markdown(self, elements: List[Any]) -> str:
|
384
429
|
return "\n\n".join((self._convert_to_markdown(el) for el in elements))
|
385
430
|
|
386
431
|
def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
|
387
432
|
if dpath.get(el, "type") == "Title":
|
388
|
-
|
433
|
+
category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
|
434
|
+
if not isinstance(category_depth, int):
|
435
|
+
category_depth = (
|
436
|
+
int(category_depth) if isinstance(category_depth, (str, float)) else 1
|
437
|
+
)
|
438
|
+
heading_str = "#" * category_depth
|
389
439
|
return f"{heading_str} {dpath.get(el, 'text')}"
|
390
440
|
elif dpath.get(el, "type") == "ListItem":
|
391
441
|
return f"- {dpath.get(el, 'text')}"
|
@@ -1,13 +1,13 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: airbyte-cdk
|
3
|
-
Version: 6.6.
|
3
|
+
Version: 6.6.2
|
4
4
|
Summary: A framework for writing Airbyte Connectors.
|
5
5
|
Home-page: https://airbyte.com
|
6
6
|
License: MIT
|
7
7
|
Keywords: airbyte,connector-development-kit,cdk
|
8
8
|
Author: Airbyte
|
9
9
|
Author-email: contact@airbyte.io
|
10
|
-
Requires-Python: >=3.10,<
|
10
|
+
Requires-Python: >=3.10,<3.13
|
11
11
|
Classifier: Development Status :: 3 - Alpha
|
12
12
|
Classifier: Intended Audience :: Developers
|
13
13
|
Classifier: License :: OSI Approved :: MIT License
|
@@ -15,7 +15,6 @@ Classifier: Programming Language :: Python :: 3
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.10
|
16
16
|
Classifier: Programming Language :: Python :: 3.11
|
17
17
|
Classifier: Programming Language :: Python :: 3.12
|
18
|
-
Classifier: Programming Language :: Python :: 3.13
|
19
18
|
Classifier: Topic :: Scientific/Engineering
|
20
19
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
20
|
Provides-Extra: file-based
|
@@ -42,7 +41,7 @@ Requires-Dist: jsonschema (>=3.2.0,<3.3.0)
|
|
42
41
|
Requires-Dist: langchain (==0.1.16) ; extra == "vector-db-based"
|
43
42
|
Requires-Dist: langchain_core (==0.1.42)
|
44
43
|
Requires-Dist: markdown ; extra == "file-based"
|
45
|
-
Requires-Dist: nltk (==3.
|
44
|
+
Requires-Dist: nltk (==3.9.1)
|
46
45
|
Requires-Dist: numpy (<2)
|
47
46
|
Requires-Dist: openai[embeddings] (==0.27.9) ; extra == "vector-db-based"
|
48
47
|
Requires-Dist: orjson (>=3.10.7,<4.0.0)
|
@@ -202,7 +202,7 @@ airbyte_cdk/sources/file_based/file_types/file_transfer.py,sha256=HyGRihJxcb_lEs
|
|
202
202
|
airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=JgpH21PrbRqwK92BJklZWvh2TndA6xZ-eP1LPMo44oQ,2832
|
203
203
|
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=k1ri7TtwrN8oYZpCl1bNNeAQmwBbwLjmOmIz8-tKflY,5897
|
204
204
|
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=0B4RYehU4z4dys3Tu-O98B0Uw7JO_LzStRwmNxKh6Xk,10486
|
205
|
-
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=
|
205
|
+
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=3dlhzKq4Vuc-E01uG5TLPAn0E8Be2SrBNukeP22RNFM,18601
|
206
206
|
airbyte_cdk/sources/file_based/remote_file.py,sha256=yqRz93vPe8PBXLIMJ5W5u2JRlZRhg6sBrAjn3pPjJ8A,315
|
207
207
|
airbyte_cdk/sources/file_based/schema_helpers.py,sha256=Cf8FH1bDFP0qCDDfEYir_WjP4exXUnikz8hZ40y1Ek0,9601
|
208
208
|
airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
|
@@ -327,7 +327,7 @@ airbyte_cdk/utils/slice_hasher.py,sha256=EemcgcQlI8-LPYOPlYv4Qkdjyho79XVLWaUHF5X
|
|
327
327
|
airbyte_cdk/utils/spec_schema_transformations.py,sha256=LVc9KbtMeV_z99jWo0Ou8u4l6eBJ0BWNhxj4zrrGKRs,763
|
328
328
|
airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
|
329
329
|
airbyte_cdk/utils/traced_exception.py,sha256=89TQdFuYZ1NJgmFpqLzY_T_T_64TpJYmVqs119Bp43g,6164
|
330
|
-
airbyte_cdk-6.6.
|
331
|
-
airbyte_cdk-6.6.
|
332
|
-
airbyte_cdk-6.6.
|
333
|
-
airbyte_cdk-6.6.
|
330
|
+
airbyte_cdk-6.6.2.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
331
|
+
airbyte_cdk-6.6.2.dist-info/METADATA,sha256=mgsuZ-wB8P42xXmulcAD8izuKL-uTM-2IPbsFlFdoCw,13297
|
332
|
+
airbyte_cdk-6.6.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
333
|
+
airbyte_cdk-6.6.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|