airbyte-cdk 6.6.1__py3-none-any.whl → 6.6.2__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -29,16 +29,25 @@ from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
29
29
  from airbyte_cdk.utils import is_cloud_environment
30
30
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
31
31
  from unstructured.file_utils.filetype import (
32
+ EXT_TO_FILETYPE,
32
33
  FILETYPE_TO_MIMETYPE,
33
34
  STR_TO_FILETYPE,
34
35
  FileType,
35
36
  detect_filetype,
36
37
  )
38
+ import nltk
37
39
 
38
40
  unstructured_partition_pdf = None
39
41
  unstructured_partition_docx = None
40
42
  unstructured_partition_pptx = None
41
43
 
44
+ try:
45
+ nltk.data.find("tokenizers/punkt.zip")
46
+ nltk.data.find("tokenizers/punkt_tab.zip")
47
+ except LookupError:
48
+ nltk.download("punkt")
49
+ nltk.download("punkt_tab")
50
+
42
51
 
43
52
  def optional_decode(contents: Union[str, bytes]) -> str:
44
53
  if isinstance(contents, bytes):
@@ -108,9 +117,11 @@ class UnstructuredParser(FileTypeParser):
108
117
  format = _extract_format(config)
109
118
  with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
110
119
  filetype = self._get_filetype(file_handle, file)
111
-
112
120
  if filetype not in self._supported_file_types() and not format.skip_unprocessable_files:
113
- raise self._create_parse_error(file, self._get_file_type_error_message(filetype))
121
+ raise self._create_parse_error(
122
+ file,
123
+ self._get_file_type_error_message(filetype),
124
+ )
114
125
 
115
126
  return {
116
127
  "content": {
@@ -159,6 +170,10 @@ class UnstructuredParser(FileTypeParser):
159
170
  logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
160
171
  else:
161
172
  raise e
173
+ except Exception as e:
174
+ exception_str = str(e)
175
+ logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
176
+ raise e
162
177
 
163
178
  def _read_file(
164
179
  self,
@@ -176,20 +191,32 @@ class UnstructuredParser(FileTypeParser):
176
191
  # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
177
192
  raise Exception("unstructured library is not available")
178
193
 
179
- filetype = self._get_filetype(file_handle, remote_file)
194
+ filetype: FileType | None = self._get_filetype(file_handle, remote_file)
180
195
 
181
- if filetype == FileType.MD or filetype == FileType.TXT:
196
+ if filetype is None or filetype not in self._supported_file_types():
197
+ raise self._create_parse_error(
198
+ remote_file,
199
+ self._get_file_type_error_message(filetype),
200
+ )
201
+ if filetype in {FileType.MD, FileType.TXT}:
182
202
  file_content: bytes = file_handle.read()
183
203
  decoded_content: str = optional_decode(file_content)
184
204
  return decoded_content
185
- if filetype not in self._supported_file_types():
186
- raise self._create_parse_error(remote_file, self._get_file_type_error_message(filetype))
187
205
  if format.processing.mode == "local":
188
- return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
206
+ return self._read_file_locally(
207
+ file_handle,
208
+ filetype,
209
+ format.strategy,
210
+ remote_file,
211
+ )
189
212
  elif format.processing.mode == "api":
190
213
  try:
191
214
  result: str = self._read_file_remotely_with_retries(
192
- file_handle, format.processing, filetype, format.strategy, remote_file
215
+ file_handle,
216
+ format.processing,
217
+ filetype,
218
+ format.strategy,
219
+ remote_file,
193
220
  )
194
221
  except Exception as e:
195
222
  # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
@@ -336,7 +363,11 @@ class UnstructuredParser(FileTypeParser):
336
363
 
337
364
  return self._render_markdown([element.to_dict() for element in elements])
338
365
 
339
- def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError:
366
+ def _create_parse_error(
367
+ self,
368
+ remote_file: RemoteFile,
369
+ message: str,
370
+ ) -> RecordParseError:
340
371
  return RecordParseError(
341
372
  FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
342
373
  )
@@ -360,32 +391,51 @@ class UnstructuredParser(FileTypeParser):
360
391
  # detect_filetype is either using the file name or file content
361
392
  # if possible, try to leverage the file name to detect the file type
362
393
  # if the file name is not available, use the file content
363
- file_type = detect_filetype(
364
- filename=remote_file.uri,
365
- )
366
- if file_type is not None and not file_type == FileType.UNK:
394
+ file_type: FileType | None = None
395
+ try:
396
+ file_type = detect_filetype(
397
+ filename=remote_file.uri,
398
+ )
399
+ except Exception:
400
+ # Path doesn't exist locally. Try something else...
401
+ pass
402
+
403
+ if file_type and file_type != FileType.UNK:
367
404
  return file_type
368
405
 
369
406
  type_based_on_content = detect_filetype(file=file)
407
+ file.seek(0) # detect_filetype is reading to read the file content, so we need to reset
370
408
 
371
- # detect_filetype is reading to read the file content
372
- file.seek(0)
409
+ if type_based_on_content and type_based_on_content != FileType.UNK:
410
+ return type_based_on_content
373
411
 
374
- return type_based_on_content
412
+ extension = "." + remote_file.uri.split(".")[-1].lower()
413
+ if extension in EXT_TO_FILETYPE:
414
+ return EXT_TO_FILETYPE[extension]
415
+
416
+ return None
375
417
 
376
418
  def _supported_file_types(self) -> List[Any]:
377
419
  return [FileType.MD, FileType.PDF, FileType.DOCX, FileType.PPTX, FileType.TXT]
378
420
 
379
- def _get_file_type_error_message(self, file_type: FileType) -> str:
421
+ def _get_file_type_error_message(
422
+ self,
423
+ file_type: FileType | None,
424
+ ) -> str:
380
425
  supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
381
- return f"File type {file_type} is not supported. Supported file types are {supported_file_types}"
426
+ return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
382
427
 
383
428
  def _render_markdown(self, elements: List[Any]) -> str:
384
429
  return "\n\n".join((self._convert_to_markdown(el) for el in elements))
385
430
 
386
431
  def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
387
432
  if dpath.get(el, "type") == "Title":
388
- heading_str = "#" * (dpath.get(el, "metadata/category_depth", default=1) or 1)
433
+ category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
434
+ if not isinstance(category_depth, int):
435
+ category_depth = (
436
+ int(category_depth) if isinstance(category_depth, (str, float)) else 1
437
+ )
438
+ heading_str = "#" * category_depth
389
439
  return f"{heading_str} {dpath.get(el, 'text')}"
390
440
  elif dpath.get(el, "type") == "ListItem":
391
441
  return f"- {dpath.get(el, 'text')}"
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-cdk
3
- Version: 6.6.1
3
+ Version: 6.6.2
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  Home-page: https://airbyte.com
6
6
  License: MIT
7
7
  Keywords: airbyte,connector-development-kit,cdk
8
8
  Author: Airbyte
9
9
  Author-email: contact@airbyte.io
10
- Requires-Python: >=3.10,<4.0
10
+ Requires-Python: >=3.10,<3.13
11
11
  Classifier: Development Status :: 3 - Alpha
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: License :: OSI Approved :: MIT License
@@ -15,7 +15,6 @@ Classifier: Programming Language :: Python :: 3
15
15
  Classifier: Programming Language :: Python :: 3.10
16
16
  Classifier: Programming Language :: Python :: 3.11
17
17
  Classifier: Programming Language :: Python :: 3.12
18
- Classifier: Programming Language :: Python :: 3.13
19
18
  Classifier: Topic :: Scientific/Engineering
20
19
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
20
  Provides-Extra: file-based
@@ -42,7 +41,7 @@ Requires-Dist: jsonschema (>=3.2.0,<3.3.0)
42
41
  Requires-Dist: langchain (==0.1.16) ; extra == "vector-db-based"
43
42
  Requires-Dist: langchain_core (==0.1.42)
44
43
  Requires-Dist: markdown ; extra == "file-based"
45
- Requires-Dist: nltk (==3.8.1)
44
+ Requires-Dist: nltk (==3.9.1)
46
45
  Requires-Dist: numpy (<2)
47
46
  Requires-Dist: openai[embeddings] (==0.27.9) ; extra == "vector-db-based"
48
47
  Requires-Dist: orjson (>=3.10.7,<4.0.0)
@@ -202,7 +202,7 @@ airbyte_cdk/sources/file_based/file_types/file_transfer.py,sha256=HyGRihJxcb_lEs
202
202
  airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=JgpH21PrbRqwK92BJklZWvh2TndA6xZ-eP1LPMo44oQ,2832
203
203
  airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=k1ri7TtwrN8oYZpCl1bNNeAQmwBbwLjmOmIz8-tKflY,5897
204
204
  airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=0B4RYehU4z4dys3Tu-O98B0Uw7JO_LzStRwmNxKh6Xk,10486
205
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=oR0XdsMLpOh9kXzkVuqZbIfxzsREeWYCWpWY2vlyVHk,17171
205
+ airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=3dlhzKq4Vuc-E01uG5TLPAn0E8Be2SrBNukeP22RNFM,18601
206
206
  airbyte_cdk/sources/file_based/remote_file.py,sha256=yqRz93vPe8PBXLIMJ5W5u2JRlZRhg6sBrAjn3pPjJ8A,315
207
207
  airbyte_cdk/sources/file_based/schema_helpers.py,sha256=Cf8FH1bDFP0qCDDfEYir_WjP4exXUnikz8hZ40y1Ek0,9601
208
208
  airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=sEVnRhZ8x9f7PNjo6lewxid9z0PI8eSj7gSoFC3MH1Y,527
@@ -327,7 +327,7 @@ airbyte_cdk/utils/slice_hasher.py,sha256=EemcgcQlI8-LPYOPlYv4Qkdjyho79XVLWaUHF5X
327
327
  airbyte_cdk/utils/spec_schema_transformations.py,sha256=LVc9KbtMeV_z99jWo0Ou8u4l6eBJ0BWNhxj4zrrGKRs,763
328
328
  airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
329
329
  airbyte_cdk/utils/traced_exception.py,sha256=89TQdFuYZ1NJgmFpqLzY_T_T_64TpJYmVqs119Bp43g,6164
330
- airbyte_cdk-6.6.1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
331
- airbyte_cdk-6.6.1.dist-info/METADATA,sha256=y0IouLNV_Hs3TtkCZToJcltP4_NaFV0Yrtzguh5yPvc,13347
332
- airbyte_cdk-6.6.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
333
- airbyte_cdk-6.6.1.dist-info/RECORD,,
330
+ airbyte_cdk-6.6.2.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
331
+ airbyte_cdk-6.6.2.dist-info/METADATA,sha256=mgsuZ-wB8P42xXmulcAD8izuKL-uTM-2IPbsFlFdoCw,13297
332
+ airbyte_cdk-6.6.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
333
+ airbyte_cdk-6.6.2.dist-info/RECORD,,