unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +9 -6
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
  69. unstructured_ingest/v2/processes/connectors/local.py +27 -16
  70. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  72. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  73. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
  75. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  76. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  77. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  78. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  79. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  80. unstructured_ingest/v2/processes/embedder.py +106 -47
  81. unstructured_ingest/v2/processes/filter.py +11 -5
  82. unstructured_ingest/v2/processes/partitioner.py +79 -33
  83. unstructured_ingest/v2/processes/uncompress.py +3 -3
  84. unstructured_ingest/v2/utils.py +45 -0
  85. unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
  86. unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
  87. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
  88. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
  89. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  90. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  91. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  92. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  93. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  94. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  95. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  96. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  97. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  99. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  100. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  101. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  102. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  103. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  104. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  105. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  106. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  107. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  108. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  109. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  110. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  111. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  112. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  113. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  114. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  115. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  116. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  117. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- __version__ = "0.0.3" # pragma: no cover
1
+ __version__ = "0.0.5" # pragma: no cover
@@ -1,16 +1,21 @@
1
+ from typing import TYPE_CHECKING
2
+
1
3
  import click
2
4
 
3
5
  from unstructured_ingest.cli import dest, src
4
6
  from unstructured_ingest.v2.cli.cmds import dest as dest_v2
5
7
  from unstructured_ingest.v2.cli.cmds import src as src_v2
6
8
 
9
+ if TYPE_CHECKING:
10
+ from click import Command
11
+
7
12
 
8
13
  @click.group()
9
14
  def ingest():
10
15
  pass
11
16
 
12
17
 
13
- def get_cmd() -> click.Command:
18
+ def get_cmd() -> "Command":
14
19
  """Construct and return a Click command object representing the main command for the CLI.
15
20
 
16
21
  This function adds all dest_subcommand(s) to each src_subcommand, and adds all of those
@@ -7,8 +7,8 @@ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
7
  from unstructured_ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_base_src_cmd
8
8
 
9
9
  from .airtable import get_base_src_cmd as airtable_base_src_cmd
10
- from .astra import get_base_dest_cmd as astra_base_dest_cmd
11
- from .astra import get_base_src_cmd as astra_base_src_cmd
10
+ from .astradb import get_base_dest_cmd as astradb_base_dest_cmd
11
+ from .astradb import get_base_src_cmd as astradb_base_src_cmd
12
12
  from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd
13
13
  from .biomed import get_base_src_cmd as biomed_base_src_cmd
14
14
  from .chroma import get_base_dest_cmd as chroma_base_dest_cmd
@@ -63,7 +63,7 @@ if t.TYPE_CHECKING:
63
63
 
64
64
  base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [
65
65
  airtable_base_src_cmd,
66
- astra_base_src_cmd,
66
+ astradb_base_src_cmd,
67
67
  azure_base_src_cmd,
68
68
  biomed_base_src_cmd,
69
69
  box_base_src_cmd,
@@ -106,7 +106,7 @@ if src_duplicates:
106
106
  )
107
107
 
108
108
  base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
109
- astra_base_dest_cmd,
109
+ astradb_base_dest_cmd,
110
110
  azure_base_dest_cmd,
111
111
  box_base_dest_cmd,
112
112
  chroma_base_dest_cmd,
@@ -4,11 +4,11 @@ from dataclasses import dataclass
4
4
  import click
5
5
 
6
6
  from unstructured_ingest.cli.interfaces import CliConfig, Dict
7
- from unstructured_ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig
7
+ from unstructured_ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig
8
8
 
9
9
 
10
10
  @dataclass
11
- class AstraCliConfig(SimpleAstraConfig, CliConfig):
11
+ class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig):
12
12
  @staticmethod
13
13
  def get_cli_options() -> t.List[click.Option]:
14
14
  options = [
@@ -48,7 +48,7 @@ class AstraCliConfig(SimpleAstraConfig, CliConfig):
48
48
 
49
49
 
50
50
  @dataclass
51
- class AstraCliWriteConfig(AstraWriteConfig, CliConfig):
51
+ class AstraDBCliWriteConfig(AstraDBWriteConfig, CliConfig):
52
52
  @staticmethod
53
53
  def get_cli_options() -> t.List[click.Option]:
54
54
  options = [
@@ -81,8 +81,8 @@ def get_base_src_cmd():
81
81
  from unstructured_ingest.cli.base.src import BaseSrcCmd
82
82
 
83
83
  cmd_cls = BaseSrcCmd(
84
- cmd_name="astra",
85
- cli_config=AstraCliConfig,
84
+ cmd_name="astradb",
85
+ cli_config=AstraDBCliConfig,
86
86
  )
87
87
  return cmd_cls
88
88
 
@@ -91,9 +91,9 @@ def get_base_dest_cmd():
91
91
  from unstructured_ingest.cli.base.dest import BaseDestCmd
92
92
 
93
93
  cmd_cls = BaseDestCmd(
94
- cmd_name="astra",
95
- cli_config=AstraCliConfig,
96
- additional_cli_options=[AstraCliWriteConfig],
97
- write_config=AstraWriteConfig,
94
+ cmd_name="astradb",
95
+ cli_config=AstraDBCliConfig,
96
+ additional_cli_options=[AstraDBCliWriteConfig],
97
+ write_config=AstraDBWriteConfig,
98
98
  )
99
99
  return cmd_cls
@@ -11,7 +11,6 @@ from pathlib import Path
11
11
  import click
12
12
  from dataclasses_json.core import Json
13
13
  from typing_extensions import Self
14
- from unstructured.chunking import CHUNK_MAX_CHARS_DEFAULT, CHUNK_MULTI_PAGE_DEFAULT
15
14
 
16
15
  from unstructured_ingest.interfaces import (
17
16
  BaseConfig,
@@ -25,6 +24,9 @@ from unstructured_ingest.interfaces import (
25
24
  RetryStrategyConfig,
26
25
  )
27
26
 
27
+ CHUNK_MAX_CHARS_DEFAULT: int = 500
28
+ CHUNK_MULTI_PAGE_DEFAULT: bool = True
29
+
28
30
 
29
31
  class Dict(click.ParamType):
30
32
  name = "dict"
@@ -412,14 +414,19 @@ class CliFilesStorageConfig(FileStorageConfig, CliMixin):
412
414
  class CliEmbeddingConfig(EmbeddingConfig, CliMixin):
413
415
  @staticmethod
414
416
  def get_cli_options() -> t.List[click.Option]:
415
- from unstructured.embed import EMBEDDING_PROVIDER_TO_CLASS_MAP
416
-
417
+ embed_providers = [
418
+ "langchain-openai",
419
+ "langchain-huggingface",
420
+ "langchain-aws-bedrock",
421
+ "langchain-vertexai",
422
+ "langchain-voyageai",
423
+ "octoai",
424
+ ]
417
425
  options = [
418
426
  click.Option(
419
427
  ["--embedding-provider"],
420
- help="Type of the embedding class to be used. Can be one of: "
421
- f"{list(EMBEDDING_PROVIDER_TO_CLASS_MAP)}",
422
- type=click.Choice(list(EMBEDDING_PROVIDER_TO_CLASS_MAP)),
428
+ help="Type of the embedding class to be used.",
429
+ type=click.Choice(embed_providers),
423
430
  ),
424
431
  click.Option(
425
432
  ["--embedding-api-key"],
@@ -30,23 +30,23 @@ NON_INDEXED_FIELDS = ["metadata._node_content", "content"]
30
30
 
31
31
 
32
32
  @dataclass
33
- class AstraAccessConfig(AccessConfig):
33
+ class AstraDBAccessConfig(AccessConfig):
34
34
  token: str = enhanced_field(sensitive=True)
35
35
  api_endpoint: str = enhanced_field(sensitive=True)
36
36
 
37
37
 
38
38
  @dataclass
39
- class SimpleAstraConfig(BaseConnectorConfig):
40
- access_config: AstraAccessConfig
39
+ class SimpleAstraDBConfig(BaseConnectorConfig):
40
+ access_config: AstraDBAccessConfig
41
41
  collection_name: str
42
42
  namespace: t.Optional[str] = None
43
43
 
44
44
 
45
45
  @dataclass
46
- class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
47
- connector_config: SimpleAstraConfig
46
+ class AstraDBIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
47
+ connector_config: SimpleAstraDBConfig
48
48
  metadata: t.Dict[str, str] = field(default_factory=dict)
49
- registry_name: str = "astra"
49
+ registry_name: str = "astradb"
50
50
 
51
51
  @property
52
52
  def filename(self):
@@ -75,7 +75,7 @@ class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
75
75
  )
76
76
 
77
77
  @SourceConnectionError.wrap
78
- @requires_dependencies(["astrapy"], extras="astra")
78
+ @requires_dependencies(["astrapy"], extras="astradb")
79
79
  @BaseSingleIngestDoc.skip_if_file_exists
80
80
  def get_file(self):
81
81
  self.filename.parent.mkdir(parents=True, exist_ok=True)
@@ -89,19 +89,19 @@ class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
89
89
 
90
90
 
91
91
  @dataclass
92
- class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
93
- connector_config: SimpleAstraConfig
92
+ class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
93
+ connector_config: SimpleAstraDBConfig
94
94
  _astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
95
95
  _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)
96
96
 
97
97
  @property
98
- @requires_dependencies(["astrapy"], extras="astra")
98
+ @requires_dependencies(["astrapy"], extras="astradb")
99
99
  def astra_db_collection(self) -> "AstraDBCollection":
100
100
  if self._astra_db_collection is None:
101
101
  from astrapy.db import AstraDB
102
102
 
103
103
  # Build the Astra DB object.
104
- # caller_name/version for AstraDB tracking
104
+ # caller_name/version for Astra DB tracking
105
105
  self._astra_db = AstraDB(
106
106
  api_endpoint=self.connector_config.access_config.api_endpoint,
107
107
  token=self.connector_config.access_config.token,
@@ -116,12 +116,12 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
116
116
  )
117
117
  return self._astra_db_collection # type: ignore
118
118
 
119
- @requires_dependencies(["astrapy"], extras="astra")
119
+ @requires_dependencies(["astrapy"], extras="astradb")
120
120
  @SourceConnectionError.wrap # type: ignore
121
121
  def initialize(self):
122
122
  _ = self.astra_db_collection
123
123
 
124
- @requires_dependencies(["astrapy"], extras="astra")
124
+ @requires_dependencies(["astrapy"], extras="astradb")
125
125
  def check_connection(self):
126
126
  try:
127
127
  _ = self.astra_db_collection
@@ -129,14 +129,14 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
129
129
  logger.error(f"Failed to validate connection {e}", exc_info=True)
130
130
  raise SourceConnectionError(f"failed to validate connection: {e}")
131
131
 
132
- @requires_dependencies(["astrapy"], extras="astra")
132
+ @requires_dependencies(["astrapy"], extras="astradb")
133
133
  def get_ingest_docs(self): # type: ignore
134
134
  # Perform the find operation
135
- astra_docs = list(self.astra_db_collection.paginated_find())
135
+ astra_db_docs = list(self.astra_db_collection.paginated_find())
136
136
 
137
137
  doc_list = []
138
- for record in astra_docs:
139
- doc = AstraIngestDoc(
138
+ for record in astra_db_docs:
139
+ doc = AstraDBIngestDoc(
140
140
  connector_config=self.connector_config,
141
141
  processor_config=self.processor_config,
142
142
  read_config=self.read_config,
@@ -151,16 +151,16 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
151
151
 
152
152
 
153
153
  @dataclass
154
- class AstraWriteConfig(WriteConfig):
154
+ class AstraDBWriteConfig(WriteConfig):
155
155
  embedding_dimension: int
156
156
  requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None
157
157
  batch_size: int = 20
158
158
 
159
159
 
160
160
  @dataclass
161
- class AstraDestinationConnector(BaseDestinationConnector):
162
- write_config: AstraWriteConfig
163
- connector_config: SimpleAstraConfig
161
+ class AstraDBDestinationConnector(BaseDestinationConnector):
162
+ write_config: AstraDBWriteConfig
163
+ connector_config: SimpleAstraDBConfig
164
164
  _astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
165
165
  _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)
166
166
 
@@ -179,7 +179,7 @@ class AstraDestinationConnector(BaseDestinationConnector):
179
179
  return _asdict(self_cp, **kwargs)
180
180
 
181
181
  @property
182
- @requires_dependencies(["astrapy"], extras="astra")
182
+ @requires_dependencies(["astrapy"], extras="astradb")
183
183
  def astra_db_collection(self) -> "AstraDBCollection":
184
184
  if self._astra_db_collection is None:
185
185
  from astrapy.db import AstraDB
@@ -187,11 +187,11 @@ class AstraDestinationConnector(BaseDestinationConnector):
187
187
  collection_name = self.connector_config.collection_name
188
188
  embedding_dimension = self.write_config.embedding_dimension
189
189
 
190
- # If the user has requested an indexing policy, pass it to the AstraDB
190
+ # If the user has requested an indexing policy, pass it to the Astra DB
191
191
  requested_indexing_policy = self.write_config.requested_indexing_policy
192
192
  options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
193
193
 
194
- # caller_name/version for AstraDB tracking
194
+ # caller_name/version for Astra DB tracking
195
195
  self._astra_db = AstraDB(
196
196
  api_endpoint=self.connector_config.access_config.api_endpoint,
197
197
  token=self.connector_config.access_config.token,
@@ -208,12 +208,12 @@ class AstraDestinationConnector(BaseDestinationConnector):
208
208
  )
209
209
  return self._astra_db_collection
210
210
 
211
- @requires_dependencies(["astrapy"], extras="astra")
211
+ @requires_dependencies(["astrapy"], extras="astradb")
212
212
  @DestinationConnectionError.wrap
213
213
  def initialize(self):
214
214
  _ = self.astra_db_collection
215
215
 
216
- @requires_dependencies(["astrapy"], extras="astra")
216
+ @requires_dependencies(["astrapy"], extras="astradb")
217
217
  def check_connection(self):
218
218
  try:
219
219
  _ = self.astra_db_collection
@@ -222,11 +222,11 @@ class AstraDestinationConnector(BaseDestinationConnector):
222
222
  raise DestinationConnectionError(f"failed to validate connection: {e}")
223
223
 
224
224
  def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
225
- logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra.")
225
+ logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra DB.")
226
226
 
227
- astra_batch_size = self.write_config.batch_size
227
+ astra_db_batch_size = self.write_config.batch_size
228
228
 
229
- for batch in batch_generator(elements_dict, astra_batch_size):
229
+ for batch in batch_generator(elements_dict, astra_db_batch_size):
230
230
  self._astra_db_collection.insert_many(batch)
231
231
 
232
232
  def normalize_dict(self, element_dict: dict) -> dict:
@@ -5,9 +5,6 @@ from dataclasses import dataclass
5
5
  from ftplib import FTP, error_perm
6
6
  from pathlib import Path
7
7
 
8
- import requests
9
- from requests.adapters import HTTPAdapter
10
-
11
8
  from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
12
9
  from unstructured_ingest.interfaces import (
13
10
  BaseConnectorConfig,
@@ -20,6 +17,10 @@ from unstructured_ingest.logger import logger
20
17
  from unstructured_ingest.utils.data_prep import (
21
18
  validate_date_args,
22
19
  )
20
+ from unstructured_ingest.utils.dep_check import requires_dependencies
21
+
22
+ if t.TYPE_CHECKING:
23
+ from requests import Response, Session
23
24
 
24
25
  DOMAIN = "ftp.ncbi.nlm.nih.gov"
25
26
  FTP_DOMAIN = f"ftp://{DOMAIN}"
@@ -165,8 +166,11 @@ class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
165
166
 
166
167
  return endpoint_url
167
168
 
169
+ @requires_dependencies(["requests"], extras="biomed")
168
170
  def _list_objects_api(self) -> t.List[BiomedFileMeta]:
169
171
  from bs4 import BeautifulSoup
172
+ from requests import Session
173
+ from requests.adapters import HTTPAdapter
170
174
 
171
175
  def urls_to_metadata(urls):
172
176
  files = []
@@ -193,7 +197,7 @@ class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
193
197
  endpoint_url = self.get_base_endpoints_url()
194
198
 
195
199
  while endpoint_url:
196
- session = requests.Session()
200
+ session = Session()
197
201
  adapter = HTTPAdapter()
198
202
  session.mount("http://", adapter)
199
203
  session.mount("https://", adapter)
@@ -213,7 +217,7 @@ class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
213
217
  return files
214
218
 
215
219
  @SourceConnectionNetworkError.wrap
216
- def _get_request(self, session: requests.Session, endpoint_url: str) -> requests.Response:
220
+ def _get_request(self, session: "Session", endpoint_url: str) -> "Response":
217
221
  return session.get(endpoint_url, timeout=self.connector_config.max_request_time)
218
222
 
219
223
  def _list_objects(self) -> t.List[BiomedFileMeta]:
@@ -293,7 +297,10 @@ class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
293
297
  def initialize(self):
294
298
  pass
295
299
 
300
+ @requires_dependencies(["requests"], extras="biomed")
296
301
  def check_connection(self):
302
+ import requests
303
+
297
304
  resp = requests.head(self.get_base_endpoints_url())
298
305
  try:
299
306
  resp.raise_for_status()
@@ -4,8 +4,6 @@ from dataclasses import dataclass, field
4
4
  from datetime import datetime
5
5
  from pathlib import Path
6
6
 
7
- import requests
8
-
9
7
  from unstructured_ingest.enhanced_dataclass import enhanced_field
10
8
  from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
11
9
  from unstructured_ingest.interfaces import (
@@ -208,8 +206,10 @@ class ConfluenceSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
208
206
  )
209
207
  return self._confluence
210
208
 
211
- @requires_dependencies(["atlassian"], extras="Confluence")
209
+ @requires_dependencies(["atlassian", "requests"], extras="Confluence")
212
210
  def check_connection(self):
211
+ import requests
212
+
213
213
  url = "rest/api/space"
214
214
  try:
215
215
  self.confluence.request(method="HEAD", path=url)
@@ -3,8 +3,6 @@ from dataclasses import dataclass
3
3
  from datetime import datetime
4
4
  from urllib.parse import urlparse
5
5
 
6
- import requests
7
-
8
6
  from unstructured_ingest.connector.git import (
9
7
  GitIngestDoc,
10
8
  GitSourceConnector,
@@ -71,7 +69,10 @@ class GitHubIngestDoc(GitIngestDoc):
71
69
  return content_file
72
70
 
73
71
  @SourceConnectionNetworkError.wrap
72
+ @requires_dependencies(["requests"], extras="github")
74
73
  def _fetch_content(self, content_file):
74
+ import requests
75
+
75
76
  contents = b""
76
77
  if (
77
78
  not content_file.content # type: ignore
@@ -7,8 +7,6 @@ from datetime import datetime
7
7
  from mimetypes import guess_extension
8
8
  from pathlib import Path
9
9
 
10
- from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
11
-
12
10
  from unstructured_ingest.enhanced_dataclass import enhanced_field
13
11
  from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
14
12
  from unstructured_ingest.interfaces import (
@@ -25,6 +23,7 @@ from unstructured_ingest.interfaces import (
25
23
  )
26
24
  from unstructured_ingest.logger import logger
27
25
  from unstructured_ingest.utils.dep_check import requires_dependencies
26
+ from unstructured_ingest.utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
28
27
  from unstructured_ingest.utils.string_and_date_utils import json_to_dict
29
28
 
30
29
  if t.TYPE_CHECKING:
@@ -3,8 +3,7 @@ import typing as t
3
3
  from dataclasses import dataclass, field
4
4
  from pathlib import Path
5
5
 
6
- from unstructured.__version__ import __version__ as unstructured_version
7
-
6
+ from unstructured_ingest.__version__ import __version__ as unstructured_version
8
7
  from unstructured_ingest.enhanced_dataclass import enhanced_field
9
8
  from unstructured_ingest.enhanced_dataclass.core import _asdict
10
9
  from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError, WriteError
@@ -1,7 +1,5 @@
1
1
  from typing import Any, Generator, List, Optional, Tuple
2
2
 
3
- import backoff
4
- import httpx
5
3
  import notion_client.errors
6
4
  from notion_client import Client as NotionClient
7
5
  from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint
@@ -19,25 +17,36 @@ from unstructured_ingest.connector.notion.types.database_properties import (
19
17
  from unstructured_ingest.connector.notion.types.page import Page
20
18
  from unstructured_ingest.ingest_backoff import RetryHandler
21
19
  from unstructured_ingest.interfaces import RetryStrategyConfig
20
+ from unstructured_ingest.utils.dep_check import requires_dependencies
22
21
 
23
- retryable_exceptions = (
24
- httpx.TimeoutException,
25
- httpx.HTTPStatusError,
26
- notion_client.errors.HTTPResponseError,
27
- )
22
+
23
+ @requires_dependencies(["httpx"], extras="notion")
24
+ def _get_retry_strategy(
25
+ endpoint: Endpoint, retry_strategy_config: RetryStrategyConfig
26
+ ) -> RetryHandler:
27
+ import backoff
28
+ import httpx
29
+
30
+ retryable_exceptions = (
31
+ httpx.TimeoutException,
32
+ httpx.HTTPStatusError,
33
+ notion_client.errors.HTTPResponseError,
34
+ )
35
+
36
+ return RetryHandler(
37
+ backoff.expo,
38
+ retryable_exceptions,
39
+ max_time=retry_strategy_config.max_retry_time,
40
+ max_tries=retry_strategy_config.max_retries,
41
+ logger=endpoint.parent.logger,
42
+ start_log_level=endpoint.parent.logger.level,
43
+ backoff_log_level=endpoint.parent.logger.level,
44
+ )
28
45
 
29
46
 
30
47
  def get_retry_handler(endpoint: Endpoint) -> Optional[RetryHandler]:
31
48
  if retry_strategy_config := getattr(endpoint, "retry_strategy_config"):
32
- return RetryHandler(
33
- backoff.expo,
34
- retryable_exceptions,
35
- max_time=retry_strategy_config.max_retry_time,
36
- max_tries=retry_strategy_config.max_retries,
37
- logger=endpoint.parent.logger,
38
- start_log_level=endpoint.parent.logger.level,
39
- backoff_log_level=endpoint.parent.logger.level,
40
- )
49
+ return _get_retry_strategy(endpoint=endpoint, retry_strategy_config=retry_strategy_config)
41
50
  return None
42
51
 
43
52
 
@@ -105,7 +114,10 @@ class DatabasesEndpoint(NotionDatabasesEndpoint):
105
114
  ) # type: ignore
106
115
  return Database.from_dict(data=resp)
107
116
 
117
+ @requires_dependencies(["httpx"], extras="notion")
108
118
  def retrieve_status(self, database_id: str, **kwargs) -> int:
119
+ import httpx
120
+
109
121
  request = self.parent._build_request(
110
122
  method="HEAD",
111
123
  path=f"databases/{database_id}",
@@ -203,7 +215,10 @@ class PagesEndpoint(NotionPagesEndpoint):
203
215
  ) # type: ignore
204
216
  return Page.from_dict(data=resp)
205
217
 
218
+ @requires_dependencies(["httpx"], extras="notion")
206
219
  def retrieve_status(self, page_id: str, **kwargs) -> int:
220
+ import httpx
221
+
207
222
  request = self.parent._build_request(
208
223
  method="HEAD",
209
224
  path=f"pages/{page_id}",
@@ -3,8 +3,6 @@ from dataclasses import dataclass, field
3
3
  from pathlib import Path
4
4
  from uuid import UUID
5
5
 
6
- import httpx
7
-
8
6
  from unstructured_ingest.enhanced_dataclass import enhanced_field
9
7
  from unstructured_ingest.error import SourceConnectionError
10
8
  from unstructured_ingest.interfaces import (
@@ -316,7 +314,10 @@ class NotionSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
316
314
  retry_strategy_config=self.retry_strategy_config,
317
315
  )
318
316
 
317
+ @requires_dependencies(["httpx"], extras="notion")
319
318
  def check_connection(self):
319
+ import httpx
320
+
320
321
  try:
321
322
  request = self.client._build_request("HEAD", "users")
322
323
  response = self.client.client.send(request)
@@ -2,7 +2,7 @@ import json
2
2
  from typing import Dict, Type, cast
3
3
 
4
4
  from unstructured_ingest.connector.airtable import AirtableIngestDoc
5
- from unstructured_ingest.connector.astra import AstraIngestDoc
5
+ from unstructured_ingest.connector.astradb import AstraDBIngestDoc
6
6
  from unstructured_ingest.connector.biomed import BiomedIngestDoc
7
7
  from unstructured_ingest.connector.confluence import ConfluenceIngestDoc
8
8
  from unstructured_ingest.connector.delta_table import DeltaTableIngestDoc
@@ -46,7 +46,7 @@ from unstructured_ingest.interfaces import BaseIngestDoc
46
46
 
47
47
  INGEST_DOC_NAME_TO_CLASS: Dict[str, Type[EnhancedDataClassJsonMixin]] = {
48
48
  "airtable": AirtableIngestDoc,
49
- "astra": AstraIngestDoc,
49
+ "astradb": AstraDBIngestDoc,
50
50
  "azure": AzureBlobStorageIngestDoc,
51
51
  "biomed": BiomedIngestDoc,
52
52
  "box": BoxIngestDoc,
@@ -4,8 +4,6 @@ import typing as t
4
4
  import uuid
5
5
  from dataclasses import dataclass, field
6
6
 
7
- import requests
8
-
9
7
  from unstructured_ingest.enhanced_dataclass import enhanced_field
10
8
  from unstructured_ingest.error import DestinationConnectionError
11
9
  from unstructured_ingest.interfaces import (
@@ -17,6 +15,7 @@ from unstructured_ingest.interfaces import (
17
15
  )
18
16
  from unstructured_ingest.logger import logger
19
17
  from unstructured_ingest.utils.data_prep import flatten_dict
18
+ from unstructured_ingest.utils.dep_check import requires_dependencies
20
19
 
21
20
  BASE_URL = "https://api.vectara.io/v1"
22
21
 
@@ -95,6 +94,7 @@ class VectaraDestinationConnector(BaseDestinationConnector):
95
94
  def initialize(self):
96
95
  self.vectara()
97
96
 
97
+ @requires_dependencies(["requests"], extras="vectara")
98
98
  def _request(
99
99
  self,
100
100
  endpoint: str,
@@ -102,6 +102,8 @@ class VectaraDestinationConnector(BaseDestinationConnector):
102
102
  params: t.Mapping[str, t.Any] = None,
103
103
  data: t.Mapping[str, t.Any] = None,
104
104
  ):
105
+ import requests
106
+
105
107
  url = f"{BASE_URL}/{endpoint}"
106
108
 
107
109
  headers = {
@@ -119,7 +121,10 @@ class VectaraDestinationConnector(BaseDestinationConnector):
119
121
  return response.json()
120
122
 
121
123
  # Get Oauth2 JWT token
124
+ @requires_dependencies(["requests"], extras="vectara")
122
125
  def _get_jwt_token(self):
126
+ import requests
127
+
123
128
  """Connect to the server and get a JWT token."""
124
129
  token_endpoint = self.connector_config.token_url.format(self.connector_config.customer_id)
125
130
  headers = {