unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest has been flagged as potentially problematic.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +9 -6
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
  69. unstructured_ingest/v2/processes/connectors/local.py +27 -16
  70. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  72. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  73. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
  75. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  76. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  77. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  78. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  79. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  80. unstructured_ingest/v2/processes/embedder.py +106 -47
  81. unstructured_ingest/v2/processes/filter.py +11 -5
  82. unstructured_ingest/v2/processes/partitioner.py +79 -33
  83. unstructured_ingest/v2/processes/uncompress.py +3 -3
  84. unstructured_ingest/v2/utils.py +45 -0
  85. unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
  86. unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
  87. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
  88. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
  89. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  90. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  91. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  92. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  93. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  94. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  95. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  96. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  97. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  99. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  100. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  101. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  102. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  103. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  104. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  105. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  106. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  107. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  108. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  109. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  110. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  111. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  112. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  113. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  114. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  115. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  116. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  117. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
@@ -9,7 +9,7 @@ from unstructured_ingest.v2.interfaces import FileData, download_responses
 from unstructured_ingest.v2.interfaces.downloader import Downloader
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 DownloaderT = TypeVar("DownloaderT", bound=Downloader)
 
@@ -30,15 +30,9 @@ class DownloadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.download_config.to_dict(redact_sensitive=True))
-            if self.process.download_config
-            else None
-        )
+        config = self.process.download_config.json() if self.process.download_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -180,9 +174,17 @@ class DownloadStep(PipelineStep):
         return str(filepath)
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            sterilize_dict(self.process.download_config.to_dict()), sort_keys=True
+        download_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.download_config)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config)
         )
+        hashable_dict = {
+            "download_config": download_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
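
The serialize_base_model_json helper referenced above lives in the newly added unstructured_ingest/v2/utils.py (+45 lines), whose contents are not shown in this diff. A minimal sketch of what such a helper might look like, assuming a Pydantic v1-style BaseModel.json() API; the function name matches the diff, but the body is illustrative only:

import json

from pydantic import BaseModel


def serialize_base_model_json(model: BaseModel, **json_kwargs) -> str:
    # Round-trip through the model's own .json() so custom field types
    # (e.g. SecretStr) are rendered the way Pydantic renders them, then
    # re-dump with the caller's json.dumps kwargs (sort_keys, ensure_ascii, ...).
    model_dict = json.loads(model.json())
    return json.dumps(model_dict, **json_kwargs)

Hashing that string, as get_hash does above, yields a short, stable fingerprint for the step's download and connection configuration.
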
@@ -5,13 +5,11 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured.staging.base import elements_to_dicts
-
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.embedder import Embedder
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "embed"
 
@@ -30,11 +28,7 @@ class EmbedStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.embedding_provider})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
@@ -71,13 +65,13 @@ class EmbedStep(PipelineStep):
 
         self._save_output(
             output_filepath=str(output_filepath),
-            embedded_content=elements_to_dicts(embed_content_raw),
+            embedded_content=embed_content_raw,
         )
         return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
@@ -5,7 +5,6 @@ from typing import Callable, Optional
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.filter import Filterer
 
 STEP_ID = "filter"
@@ -17,11 +16,7 @@ class FilterStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
@@ -6,7 +6,7 @@ from typing import Generator, Optional, TypeVar
 from unstructured_ingest.v2.interfaces.indexer import Indexer
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 IndexerT = TypeVar("IndexerT", bound=Indexer)
 
@@ -22,15 +22,9 @@ class IndexStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.index_config.to_dict(redact_sensitive=True))
-            if self.process.index_config
-            else None
-        )
+        config = self.process.index_config.json() if self.process.index_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -55,7 +49,17 @@ class IndexStep(PipelineStep):
                 continue
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(self.process.index_config.to_dict())
+        index_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.index_config, sort_keys=True)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config, sort_keys=True)
+        )
+        hashable_dict = {
+            "index_config": index_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
@@ -8,8 +8,8 @@ from typing import Callable, Optional, TypedDict
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.partitioner import Partitioner
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "partition"
 
@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.strategy})"
 
     def __post_init__(self):
-        config = sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
+        config = self.process.config.json()
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
@@ -56,7 +56,7 @@ class PartitionStep(PipelineStep):
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
             logger.debug(f"Skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        fn_kwargs = {"filename": path, "metadata": file_data.metadata}
+        fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
         if not asyncio.iscoroutinefunction(fn):
             partitioned_content = fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:
@@ -70,8 +70,8 @@ class PartitionStep(PipelineStep):
         return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
        )
         if extras:
             hashable_string += "".join(extras)
@@ -1,6 +1,5 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
@@ -9,7 +8,7 @@ from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "upload_stage"
 
@@ -29,9 +28,7 @@ class UploadStageStep(PipelineStep):
 
     def __post_init__(self):
         config = (
-            sterilize_dict(self.process.upload_stager_config.to_dict(redact_sensitive=True))
-            if self.process.upload_stager_config
-            else None
+            self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"Created {self.identifier} with configs: {config}")
@@ -56,8 +53,8 @@ class UploadStageStep(PipelineStep):
         return UploadStageStepResponse(file_data_path=file_data_path, path=str(staged_output_path))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.upload_stager_config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.upload_stager_config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
@@ -5,7 +5,6 @@ from typing import Callable, TypedDict
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.uncompress import Uncompressor
 
 STEP_ID = "uncompress"
@@ -21,11 +20,7 @@ class UncompressStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def _run(self, path: str, file_data_path: str) -> list[UncompressStepResponse]:
@@ -7,7 +7,6 @@ from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.uploader import UploadContent, Uploader
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep, iterable_input, timed
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 
 STEP_ID = "upload"
 
@@ -26,15 +25,9 @@ class UploadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.upload_config.to_dict(redact_sensitive=True))
-            if self.process.upload_config
-            else None
-        )
+        config = self.process.upload_config.json() if self.process.upload_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -0,0 +1,18 @@
+from .chunker import Chunker, ChunkerConfig
+from .embedder import Embedder, EmbedderConfig
+from .filter import Filterer, FiltererConfig
+from .partitioner import Partitioner, PartitionerConfig
+from .uncompress import UncompressConfig, Uncompressor
+
+__all__ = [
+    "Chunker",
+    "ChunkerConfig",
+    "Embedder",
+    "EmbedderConfig",
+    "Filterer",
+    "FiltererConfig",
+    "Partitioner",
+    "PartitionerConfig",
+    "Uncompressor",
+    "UncompressConfig",
+]
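
The new package-level __init__.py re-exports each process class alongside its config, so downstream code can import both directly from unstructured_ingest.v2.processes. For example (the chunking_strategy value is illustrative; every ChunkerConfig field shown in this diff has a default):

from unstructured_ingest.v2.processes import Chunker, ChunkerConfig  # both re-exported here

config = ChunkerConfig(chunking_strategy="by_title")
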
@@ -3,29 +3,70 @@ from dataclasses import dataclass, fields
 from pathlib import Path
 from typing import Any, Optional
 
-from unstructured.chunking import dispatch
-from unstructured.documents.elements import Element, assign_and_map_hash_ids
-from unstructured.staging.base import dict_to_elements, elements_from_json
+from pydantic import BaseModel, Field, SecretStr
 
-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
+from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
 
+CHUNK_MAX_CHARS_DEFAULT: int = 500
+CHUNK_MULTI_PAGE_DEFAULT: bool = True
 
-@dataclass
-class ChunkerConfig(EnhancedDataClassJsonMixin):
-    chunking_strategy: Optional[str] = None
-    chunking_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general"
-    chunk_by_api: bool = False
-    chunk_api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
 
-    chunk_combine_text_under_n_chars: Optional[int] = None
-    chunk_include_orig_elements: Optional[bool] = None
-    chunk_max_characters: Optional[int] = None
-    chunk_multipage_sections: Optional[bool] = None
-    chunk_new_after_n_chars: Optional[int] = None
-    chunk_overlap: Optional[int] = None
-    chunk_overlap_all: Optional[bool] = None
+class ChunkerConfig(BaseModel):
+    chunking_strategy: Optional[str] = Field(
+        default=None, description="The rule-set to use to form chunks. Omit to disable chunking."
+    )
+    chunking_endpoint: Optional[str] = Field(
+        default="https://api.unstructured.io/general/v0/general",
+        description="If chunking via api, use the following host.",
+    )
+    chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
+    chunk_api_key: Optional[SecretStr] = Field(
+        default=None, description="API Key for chunking endpoint."
+    )
+
+    chunk_combine_text_under_n_chars: Optional[int] = Field(
+        default=None,
+        description="Combine consecutive chunks when the first does not exceed this length and"
+        " the second will fit without exceeding the hard-maximum length. Only"
+        " operative for 'by_title' chunking-strategy.",
+    )
+    chunk_include_orig_elements: Optional[bool] = Field(
+        default=None,
+        description="When chunking, add the original elements consolidated to form each chunk to"
+        " `.metadata.orig_elements` on that chunk.",
+    )
+    chunk_max_characters: int = Field(
+        default=CHUNK_MAX_CHARS_DEFAULT,
+        description="Hard maximum chunk length. No chunk will exceed this length. An oversized"
+        " element will be divided by text-splitting to fit this window.",
+    )
+    chunk_multipage_sections: bool = Field(
+        default=CHUNK_MULTI_PAGE_DEFAULT,
+        description="Ignore page boundaries when chunking such that elements from two different"
+        " pages can appear in the same chunk. Only operative for 'by_title'"
+        " chunking-strategy.",
+    )
+    chunk_new_after_n_chars: Optional[int] = Field(
+        default=None,
+        description="Soft-maximum chunk length. Another element will not be added to a chunk of"
+        " this length even when it would fit without exceeding the hard-maximum"
+        " length.",
+    )
+    chunk_overlap: Optional[int] = Field(
+        default=None,
+        description="Prefix chunk text with last overlap=N characters of prior chunk. Only"
+        " applies to oversized chunks divided by text-splitting. To apply overlap to"
+        " non-oversized chunks use the --overlap-all option.",
+    )
+    chunk_overlap_all: Optional[bool] = Field(
+        default=None,
+        description="Apply overlap to chunks formed from whole elements as well as those formed"
+        " by text-splitting oversized elements. Overlap length is take from --overlap"
+        " option value.",
+    )
 
     def to_chunking_kwargs(self) -> dict[str, Any]:
         return {
@@ -47,10 +88,14 @@ class Chunker(BaseProcess, ABC):
     def is_async(self) -> bool:
         return self.config.chunk_by_api
 
-    def run(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
+    @requires_dependencies(dependencies=["unstructured"])
+    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
+        from unstructured.chunking import dispatch
+        from unstructured.staging.base import elements_from_json
+
         elements = elements_from_json(filename=str(elements_filepath))
         if not elements:
-            return elements
+            return [e.to_dict() for e in elements]
         local_chunking_strategies = ("basic", "by_title")
         if self.config.chunking_strategy not in local_chunking_strategies:
             logger.warning(
@@ -58,17 +103,19 @@ class Chunker(BaseProcess, ABC):
                     self.config.chunking_strategy, ", ".join(local_chunking_strategies)
                 )
             )
-            return elements
+            return [e.to_dict() for e in elements]
         chunked_elements = dispatch.chunk(elements=elements, **self.config.to_chunking_kwargs())
-        assign_and_map_hash_ids(chunked_elements)
-        return chunked_elements
+        chunked_elements_dicts = [e.to_dict() for e in chunked_elements]
+        chunked_elements_dicts = assign_and_map_hash_ids(elements=chunked_elements_dicts)
+        return chunked_elements_dicts
 
-    async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
+    @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
+    async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
         from unstructured_client import UnstructuredClient
         from unstructured_client.models.shared import Files, PartitionParameters
 
         client = UnstructuredClient(
-            api_key_auth=self.config.chunk_api_key,
+            api_key_auth=self.config.chunk_api_key.get_secret_value(),
             server_url=self.config.chunking_endpoint,
         )
         partition_request = self.config.to_chunking_kwargs()
@@ -89,9 +136,8 @@ class Chunker(BaseProcess, ABC):
             file_name=str(elements_filepath.resolve()),
         )
         filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
+        partition_params = PartitionParameters(**filtered_partition_request)
         resp = client.general.partition(partition_params)
-        elements_raw = resp.elements or []
-        elements = dict_to_elements(elements_raw)
-        assign_and_map_hash_ids(elements)
+        elements = resp.elements or []
+        elements = assign_and_map_hash_ids(elements=elements)
         return elements
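
With ChunkerConfig now a Pydantic model, chunk_api_key is a SecretStr: it is masked when the config is serialized for logging (the .json() calls in the pipeline steps above) and must be unwrapped explicitly with get_secret_value(), as run_async now does. A small illustration of that behaviour, assuming Pydantic v1-style serialization and a made-up key value:

from unstructured_ingest.v2.processes import ChunkerConfig

config = ChunkerConfig(chunk_by_api=True, chunk_api_key="not-a-real-key")
print(config.json())                            # chunk_api_key is rendered masked, e.g. "**********"
print(config.chunk_api_key.get_secret_value())  # prints the underlying value: not-a-real-key
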
@@ -1,3 +1,4 @@
+from abc import ABC
 from dataclasses import dataclass
 from typing import Optional, Type, TypeVar
 
@@ -25,7 +26,12 @@ UploaderT = TypeVar("UploaderT", bound=Uploader)
 
 
 @dataclass
-class SourceRegistryEntry:
+class RegistryEntry(ABC):
+    pass
+
+
+@dataclass
+class SourceRegistryEntry(RegistryEntry):
     indexer: Type[IndexerT]
     downloader: Type[DownloaderT]
 
@@ -44,7 +50,7 @@ def add_source_entry(source_type: str, entry: SourceRegistryEntry):
 
 
 @dataclass
-class DestinationRegistryEntry:
+class DestinationRegistryEntry(RegistryEntry):
     uploader: Type[UploaderT]
     upload_stager: Optional[Type[UploadStagerT]] = None
 
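The new RegistryEntry base gives SourceRegistryEntry and DestinationRegistryEntry a common ancestor, so code that walks the connector registries can handle both kinds of entry generically. A sketch of that kind of usage; the describe helper below is hypothetical, not part of the package:

from unstructured_ingest.v2.processes.connector_registry import (
    DestinationRegistryEntry,
    RegistryEntry,
    SourceRegistryEntry,
)


def describe(entry: RegistryEntry) -> str:
    # Hypothetical helper: dispatch on the concrete entry type.
    if isinstance(entry, SourceRegistryEntry):
        return f"source: {entry.indexer.__name__} / {entry.downloader.__name__}"
    if isinstance(entry, DestinationRegistryEntry):
        return f"destination: {entry.uploader.__name__}"
    return "unknown registry entry"
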
@@ -6,16 +6,22 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )
 
-from .astra import CONNECTOR_TYPE as ASTRA_CONNECTOR_TYPE
-from .astra import astra_destination_entry
+from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
+from .astradb import astra_db_destination_entry
+from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
+from .azure_cognitive_search import azure_cognitive_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
+from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
 from .databricks_volumes import databricks_volumes_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
+from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
+from .kdbai import kdbai_destination_entry
 from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
 from .local import local_destination_entry, local_source_entry
 from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
@@ -39,10 +45,13 @@ from .sql import sql_destination_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry
 
-add_destination_entry(destination_type=ASTRA_CONNECTOR_TYPE, entry=astra_destination_entry)
+add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
 
 add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)
 
+add_source_entry(source_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_source_entry)
+add_destination_entry(destination_type=COUCHBASE_CONNECTOR_TYPE, entry=couchbase_destination_entry)
+
 add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
 add_destination_entry(
     destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
@@ -77,3 +86,9 @@ add_destination_entry(
     destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
 )
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
+add_destination_entry(
+    destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
+    entry=azure_cognitive_search_destination_entry,
+)
+
+add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
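
The add_source_entry / add_destination_entry calls above are how connectors register themselves with the v2 registry, so a third-party destination could follow the identical pattern. A hypothetical example (MyUploader and the "my_destination" type are invented for illustration):

from unstructured_ingest.v2.interfaces.uploader import Uploader
from unstructured_ingest.v2.processes.connector_registry import (
    DestinationRegistryEntry,
    add_destination_entry,
)


class MyUploader(Uploader):
    # Hypothetical uploader; a real one implements the Uploader interface.
    ...


add_destination_entry(
    destination_type="my_destination",
    entry=DestinationRegistryEntry(uploader=MyUploader),
)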