unstructured-ingest 0.0.2-py3-none-any.whl → 0.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

@@ -1 +1 @@
-__version__ = "0.0.2" # pragma: no cover
+__version__ = "0.0.3" # pragma: no cover
@@ -17,7 +17,3 @@ class BaseProcess(ABC):
 
     async def run_async(self, **kwargs: Any) -> Any:
         return self.run(**kwargs)
-
-    def check_connection(self):
-        # If the process requires external connections, run a quick check
-        pass
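The removed `check_connection` hook on `BaseProcess` is superseded by the per-connector `precheck` methods added throughout the hunks below. A minimal sketch of the new contract (return `None` on success, raise a connection error on failure) follows; the `ToyUploader` class and its fields are hypothetical illustrations, not code from this package:

```python
from typing import Any


class DestinationConnectionError(Exception):
    """Stands in for unstructured_ingest.error.DestinationConnectionError."""


class ToyUploader:
    """Hypothetical connector following the precheck contract in this diff."""

    def __init__(self, reachable: bool) -> None:
        self.reachable = reachable

    def precheck(self) -> None:
        # Raise on failure, return None on success.
        if not self.reachable:
            raise DestinationConnectionError("failed to validate connection")

    def run(self, **kwargs: Any) -> None:
        print("uploading", kwargs)


uploader = ToyUploader(reachable=True)
uploader.precheck()  # fail fast, before any documents are processed
uploader.run(batch=["doc-1.json"])
```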
@@ -75,6 +75,10 @@ class AzureIndexer(FsspecIndexer):
     index_config: AzureIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    def precheck(self) -> None:
+        super().precheck()
+
     def sterilize_info(self, path) -> dict:
         info = self.fs.info(path=path)
         return sterilize_dict(data=info, default=azure_json_serial)
@@ -120,6 +124,10 @@ class AzureUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)
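Each per-connector `precheck` override exists mainly to attach `@requires_dependencies`, so a missing optional extra fails fast with an actionable message before any connection attempt. A rough sketch of how such a guard decorator can be built, assuming nothing about the actual implementation inside unstructured_ingest:

```python
import functools
import importlib
from typing import Callable, TypeVar

F = TypeVar("F", bound=Callable)


def requires_dependencies(dependencies: list[str], extras: str) -> Callable[[F], F]:
    """Sketch of a dependency-guard decorator in the spirit of the one in this diff."""

    def decorator(func: F) -> F:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Probe each dependency; collect everything that fails to import.
            missing = [dep for dep in dependencies if _missing(dep)]
            if missing:
                raise ImportError(
                    f"{func.__name__} requires missing dependencies {missing}; "
                    f"install with: pip install 'unstructured-ingest[{extras}]'"
                )
            return func(*args, **kwargs)

        return wrapper  # type: ignore[return-value]

    return decorator


def _missing(module_name: str) -> bool:
    try:
        importlib.import_module(module_name)
        return False
    except ImportError:
        return True
```

Applied to `precheck`, this surfaces the install hint at validation time rather than midway through a run.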
@@ -70,6 +70,10 @@ class BoxIndexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    @requires_dependencies(["boxfs"], extras="box")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class BoxDownloaderConfig(FsspecDownloaderConfig):
@@ -107,6 +111,10 @@ class BoxUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["boxfs"], extras="box")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["boxfs"], extras="box")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)
@@ -57,6 +57,10 @@ class DropboxIndexer(FsspecIndexer):
         if not self.index_config.path_without_protocol.startswith("/"):
             self.index_config.path_without_protocol = "/" + self.index_config.path_without_protocol
 
+    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
@@ -106,6 +110,10 @@ class DropboxUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)
@@ -9,7 +9,11 @@ from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
 from uuid import NAMESPACE_DNS, uuid5
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -300,6 +304,19 @@ class FsspecUploader(Uploader):
             f"missing 1 required positional argument: 'upload_config'"
         )
 
+    def precheck(self) -> None:
+        from fsspec import get_filesystem_class
+
+        try:
+            fs = get_filesystem_class(self.upload_config.protocol)(
+                **self.connection_config.get_access_config(),
+            )
+            root_dir = self.upload_config.path_without_protocol.split("/")[0]
+            fs.ls(path=root_dir, detail=False)
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def get_upload_path(self, file_data: FileData) -> Path:
         upload_path = (
             Path(self.upload_config.path_without_protocol)
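The new `FsspecUploader.precheck` resolves the fsspec filesystem class for the configured protocol, instantiates it with the connector's access config, and lists the first segment of the destination path (the bucket or container on object stores), so both credentials and path validity are exercised with a single cheap call. A standalone equivalent is sketched below, using the local `file` protocol so it runs without cloud credentials; the protocol, path, and config values are placeholders:

```python
from fsspec import get_filesystem_class

protocol = "file"             # "s3", "abfs", "gcs", etc. in the real connectors
path_without_protocol = "./"  # real connectors use "<bucket>/<prefix>"
access_config: dict = {}      # connector credentials would go here

fs = get_filesystem_class(protocol)(**access_config)
root_dir = path_without_protocol.split("/")[0]  # bucket name on object stores
listing = fs.ls(path=root_dir, detail=False)    # raises if unreachable/unauthorized
print(f"connection ok, {len(listing)} entries visible under {root_dir!r}")
```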
@@ -80,6 +80,10 @@ class GcsIndexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class GcsDownloaderConfig(FsspecDownloaderConfig):
@@ -117,6 +121,10 @@ class GcsUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)
@@ -111,6 +111,10 @@ class S3Indexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class S3DownloaderConfig(FsspecDownloaderConfig):
@@ -144,6 +148,10 @@ class S3Uploader(FsspecUploader):
     connection_config: S3ConnectionConfig
     upload_config: S3UploaderConfig = field(default=None)
 
+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     def __post_init__(self):
         super().__post_init__()
@@ -91,6 +91,10 @@ class SftpIndexer(FsspecIndexer):
             file.identifier = new_identifier
             yield file
 
+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class SftpDownloaderConfig(FsspecDownloaderConfig):
@@ -142,6 +146,10 @@ class SftpUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)
@@ -100,9 +100,15 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         return OpenSearch(**self.get_client_kwargs())
 
 
+@dataclass
+class OpensearchIndexerConfig(ElasticsearchIndexerConfig):
+    pass
+
+
 @dataclass
 class OpenSearchIndexer(ElasticsearchIndexer):
     connection_config: OpenSearchConnectionConfig
+    index_config: OpensearchIndexerConfig
     client: "OpenSearch" = field(init=False)
 
     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -112,9 +118,15 @@ class OpenSearchIndexer(ElasticsearchIndexer):
         return scan
 
 
+@dataclass
+class OpensearchDownloaderConfig(ElasticsearchDownloaderConfig):
+    pass
+
+
 @dataclass
 class OpenSearchDownloader(ElasticsearchDownloader):
     connection_config: OpenSearchConnectionConfig
+    download_config: OpensearchDownloaderConfig
     connector_type: str = CONNECTOR_TYPE
 
     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -125,9 +137,15 @@ class OpenSearchDownloader(ElasticsearchDownloader):
         return AsyncOpenSearch, async_scan
 
 
+@dataclass
+class OpensearchUploaderConfig(ElasticsearchUploaderConfig):
+    pass
+
+
 @dataclass
 class OpenSearchUploader(ElasticsearchUploader):
     connection_config: OpenSearchConnectionConfig
+    upload_config: OpensearchUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
     @requires_dependencies(["opensearchpy"], extras="opensearch")
@@ -137,19 +155,29 @@ class OpenSearchUploader(ElasticsearchUploader):
         return parallel_bulk
 
 
+@dataclass
+class OpensearchUploadStagerConfig(ElasticsearchUploadStagerConfig):
+    pass
+
+
+@dataclass
+class OpensearchUploadStager(ElasticsearchUploadStager):
+    upload_stager_config: OpensearchUploadStagerConfig
+
+
 opensearch_source_entry = SourceRegistryEntry(
     connection_config=OpenSearchConnectionConfig,
     indexer=OpenSearchIndexer,
-    indexer_config=ElasticsearchIndexerConfig,
+    indexer_config=OpensearchIndexerConfig,
     downloader=OpenSearchDownloader,
-    downloader_config=ElasticsearchDownloaderConfig,
+    downloader_config=OpensearchDownloaderConfig,
 )
 
 
 opensearch_destination_entry = DestinationRegistryEntry(
     connection_config=OpenSearchConnectionConfig,
-    upload_stager_config=ElasticsearchUploadStagerConfig,
-    upload_stager=ElasticsearchUploadStager,
-    uploader_config=ElasticsearchUploaderConfig,
+    upload_stager_config=OpensearchUploadStagerConfig,
+    upload_stager=OpensearchUploadStager,
+    uploader_config=OpensearchUploaderConfig,
     uploader=OpenSearchUploader,
 )
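The new `Opensearch*Config` classes are empty subclasses of their Elasticsearch counterparts: same fields, same behavior, but distinct types that the OpenSearch registry entries can key on instead of sharing the Elasticsearch config types. A small sketch of this identity-subclass pattern follows; the `index_name` and `batch_size` fields are invented for illustration, not the library's actual config fields:

```python
from dataclasses import dataclass


@dataclass
class ElasticsearchIndexerConfig:
    index_name: str = "ingest-test"  # hypothetical fields for the sketch
    batch_size: int = 100


# Identity-only subclass, mirroring OpensearchIndexerConfig in this diff:
# identical fields and behavior, but a distinct type that registries and
# type-driven CLI generation can dispatch on.
@dataclass
class OpensearchIndexerConfig(ElasticsearchIndexerConfig):
    pass


cfg = OpensearchIndexerConfig(index_name="logs")
assert isinstance(cfg, ElasticsearchIndexerConfig)   # still substitutable
assert type(cfg) is OpensearchIndexerConfig          # but separately identifiable
```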
@@ -0,0 +1,175 @@
+Metadata-Version: 2.1
+Name: unstructured-ingest
+Version: 0.0.3
+Summary: A library that prepares raw documents for downstream ML tasks.
+Home-page: https://github.com/Unstructured-IO/unstructured-ingest
+Author: Unstructured Technologies
+Author-email: devops@unstructuredai.io
+License: Apache-2.0
+Keywords: NLP PDF HTML CV XML parsing preprocessing
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9.0,<3.13
+Description-Content-Type: text/markdown
+Requires-Dist: unstructured
+Requires-Dist: python-dateutil
+Requires-Dist: pandas
+Provides-Extra: airtable
+Requires-Dist: pyairtable ; extra == 'airtable'
+Provides-Extra: astra
+Requires-Dist: astrapy ; extra == 'astra'
+Provides-Extra: azure
+Requires-Dist: fsspec ; extra == 'azure'
+Requires-Dist: adlfs ; extra == 'azure'
+Provides-Extra: azure-cognitive-search
+Requires-Dist: azure-search-documents ; extra == 'azure-cognitive-search'
+Provides-Extra: bedrock
+Requires-Dist: boto3 ; extra == 'bedrock'
+Requires-Dist: langchain-community ; extra == 'bedrock'
+Provides-Extra: biomed
+Requires-Dist: bs4 ; extra == 'biomed'
+Provides-Extra: box
+Requires-Dist: fsspec ; extra == 'box'
+Requires-Dist: boxfs ; extra == 'box'
+Provides-Extra: chroma
+Requires-Dist: typer <=0.9.0 ; extra == 'chroma'
+Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
+Requires-Dist: chromadb ; extra == 'chroma'
+Provides-Extra: clarifai
+Requires-Dist: clarifai ; extra == 'clarifai'
+Provides-Extra: confluence
+Requires-Dist: atlassian-python-api ; extra == 'confluence'
+Provides-Extra: csv
+Requires-Dist: unstructured[tsv] ; extra == 'csv'
+Provides-Extra: databricks-volumes
+Requires-Dist: databricks-sdk ; extra == 'databricks-volumes'
+Provides-Extra: delta-table
+Requires-Dist: fsspec ; extra == 'delta-table'
+Requires-Dist: deltalake ; extra == 'delta-table'
+Provides-Extra: discord
+Requires-Dist: discord-py ; extra == 'discord'
+Provides-Extra: doc
+Requires-Dist: unstructured[docx] ; extra == 'doc'
+Provides-Extra: docx
+Requires-Dist: unstructured[docx] ; extra == 'docx'
+Provides-Extra: dropbox
+Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
+Requires-Dist: fsspec ; extra == 'dropbox'
+Provides-Extra: elasticsearch
+Requires-Dist: elasticsearch[async] ; extra == 'elasticsearch'
+Provides-Extra: embed-huggingface
+Requires-Dist: sentence-transformers ; extra == 'embed-huggingface'
+Requires-Dist: langchain-community ; extra == 'embed-huggingface'
+Requires-Dist: huggingface ; extra == 'embed-huggingface'
+Provides-Extra: embed-octoai
+Requires-Dist: tiktoken ; extra == 'embed-octoai'
+Requires-Dist: openai ; extra == 'embed-octoai'
+Provides-Extra: embed-vertexai
+Requires-Dist: langchain ; extra == 'embed-vertexai'
+Requires-Dist: langchain-community ; extra == 'embed-vertexai'
+Requires-Dist: langchain-google-vertexai ; extra == 'embed-vertexai'
+Provides-Extra: embed-voyageai
+Requires-Dist: langchain ; extra == 'embed-voyageai'
+Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
+Provides-Extra: epub
+Requires-Dist: unstructured[epub] ; extra == 'epub'
+Provides-Extra: gcs
+Requires-Dist: fsspec ; extra == 'gcs'
+Requires-Dist: bs4 ; extra == 'gcs'
+Requires-Dist: gcsfs ; extra == 'gcs'
+Provides-Extra: github
+Requires-Dist: pygithub >1.58.0 ; extra == 'github'
+Provides-Extra: gitlab
+Requires-Dist: python-gitlab ; extra == 'gitlab'
+Provides-Extra: google-drive
+Requires-Dist: google-api-python-client ; extra == 'google-drive'
+Provides-Extra: hubspot
+Requires-Dist: urllib3 ; extra == 'hubspot'
+Requires-Dist: hubspot-api-client ; extra == 'hubspot'
+Provides-Extra: jira
+Requires-Dist: atlassian-python-api ; extra == 'jira'
+Provides-Extra: kafka
+Requires-Dist: confluent-kafka ; extra == 'kafka'
+Provides-Extra: md
+Requires-Dist: unstructured[md] ; extra == 'md'
+Provides-Extra: milvus
+Requires-Dist: pymilvus ; extra == 'milvus'
+Provides-Extra: mongodb
+Requires-Dist: pymongo ; extra == 'mongodb'
+Provides-Extra: msg
+Requires-Dist: unstructured[msg] ; extra == 'msg'
+Provides-Extra: notion
+Requires-Dist: notion-client ; extra == 'notion'
+Requires-Dist: htmlBuilder ; extra == 'notion'
+Provides-Extra: odt
+Requires-Dist: unstructured[odt] ; extra == 'odt'
+Provides-Extra: onedrive
+Requires-Dist: bs4 ; extra == 'onedrive'
+Requires-Dist: msal ; extra == 'onedrive'
+Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
+Provides-Extra: openai
+Requires-Dist: tiktoken ; extra == 'openai'
+Requires-Dist: langchain-community ; extra == 'openai'
+Requires-Dist: openai ; extra == 'openai'
+Provides-Extra: opensearch
+Requires-Dist: opensearch-py ; extra == 'opensearch'
+Provides-Extra: org
+Requires-Dist: unstructured[org] ; extra == 'org'
+Provides-Extra: outlook
+Requires-Dist: msal ; extra == 'outlook'
+Requires-Dist: Office365-REST-Python-Client ; extra == 'outlook'
+Provides-Extra: pdf
+Requires-Dist: unstructured[pdf] ; extra == 'pdf'
+Provides-Extra: pinecone
+Requires-Dist: pinecone-client >=3.7.1 ; extra == 'pinecone'
+Provides-Extra: postgres
+Requires-Dist: psycopg2-binary ; extra == 'postgres'
+Provides-Extra: ppt
+Requires-Dist: unstructured[pptx] ; extra == 'ppt'
+Provides-Extra: pptx
+Requires-Dist: unstructured[pptx] ; extra == 'pptx'
+Provides-Extra: qdrant
+Requires-Dist: qdrant-client ; extra == 'qdrant'
+Provides-Extra: reddit
+Requires-Dist: praw ; extra == 'reddit'
+Provides-Extra: rst
+Requires-Dist: unstructured[rst] ; extra == 'rst'
+Provides-Extra: rtf
+Requires-Dist: unstructured[rtf] ; extra == 'rtf'
+Provides-Extra: s3
+Requires-Dist: fsspec ; extra == 's3'
+Requires-Dist: s3fs ; extra == 's3'
+Provides-Extra: salesforce
+Requires-Dist: simple-salesforce ; extra == 'salesforce'
+Provides-Extra: sftp
+Requires-Dist: fsspec ; extra == 'sftp'
+Requires-Dist: paramiko ; extra == 'sftp'
+Provides-Extra: sharepoint
+Requires-Dist: msal ; extra == 'sharepoint'
+Requires-Dist: Office365-REST-Python-Client ; extra == 'sharepoint'
+Provides-Extra: singlestore
+Requires-Dist: singlestoredb ; extra == 'singlestore'
+Provides-Extra: slack
+Requires-Dist: slack-sdk ; extra == 'slack'
+Provides-Extra: tsv
+Requires-Dist: unstructured[tsv] ; extra == 'tsv'
+Provides-Extra: weaviate
+Requires-Dist: weaviate-client ; extra == 'weaviate'
+Provides-Extra: wikipedia
+Requires-Dist: wikipedia ; extra == 'wikipedia'
+Provides-Extra: xlsx
+Requires-Dist: unstructured[xlsx] ; extra == 'xlsx'
+
+# Unstructured Ingest
+
+For details, see the [Unstructured Ingest overview](https://docs.unstructured.io/ingestion/overview) in the Unstructured documentation.
@@ -1,5 +1,5 @@
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=t0CFzEk7qlIWbgyEWA53ytTKmHbZ9ow2lAyjeP1bFqw,42
+unstructured_ingest/__version__.py,sha256=lgN1tyYZ7PvP0gksBpCmmobQNtds35hHWOpR04r0Z0g,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/evaluate.py,sha256=R-mKLFXbVX1xQ1tjGsLHjdP-TbSSV-925IHzggW_bIg,9793
 unstructured_ingest/interfaces.py,sha256=uS8L5mS0mXD8I4XTfVlKZxAwqnpJ4yrRqn4vxWVRhQI,31107
@@ -301,7 +301,7 @@ unstructured_ingest/v2/interfaces/connector.py,sha256=u4hE1DpTPDC04-n_IzYyn9w1gN
 unstructured_ingest/v2/interfaces/downloader.py,sha256=zs7cxhzbWVc5L0bV4gdCTexWGMVeXTQ9jJF6PCYSAss,2790
 unstructured_ingest/v2/interfaces/file_data.py,sha256=PZrPJBkNC63lNO_1nwvnAeKRxjM3CsjIY6jSO8T9bVM,1665
 unstructured_ingest/v2/interfaces/indexer.py,sha256=pMw0abNHk_tEuA4BkXX1BdAfIwHdytxj7s6tGxMvYRE,821
-unstructured_ingest/v2/interfaces/process.py,sha256=_l4dyaM0u0XxTqQw1Ghr8k2QMpQJMFapLOLhWqSdTdo,512
+unstructured_ingest/v2/interfaces/process.py,sha256=BgglTu5K93FnDDopZKKr_rkK2LTZOguR6kcQjKHjF40,392
 unstructured_ingest/v2/interfaces/processor.py,sha256=uHVHeKo5Gt_zFkaEXw7xgaCBDTEl2-Amh-ByA07258o,1620
 unstructured_ingest/v2/interfaces/upload_stager.py,sha256=SylhDl9pK6qa7hvfrhpabCkjwE03yIlI6oM-mQnqtho,1220
 unstructured_ingest/v2/interfaces/uploader.py,sha256=bzfx3Ei4poXKu-hsgjAB4sj4jKij9CoaRSadUM5LtGk,1083
@@ -337,7 +337,7 @@ unstructured_ingest/v2/processes/connectors/local.py,sha256=IJ5DjASp-5lPmb6J7Y8N
 unstructured_ingest/v2/processes/connectors/milvus.py,sha256=FWH4FH-zns7gh8sITg9pLYE9uKm_3GeOXJ4wjY6PMno,6776
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XZCgkF28HCR4DtMmr8jlxb59txXgEvfCabovROUrv6Y,4602
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=_TFO-vlyCxIxMk6hv20CEsicrlh87wCrbi4I1chsMUw,8822
-unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=HNRZVQsWnjLLm0yAGiIyHRbhAsBnGSXBO_VkUfIdwdE,5463
+unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=5L_cE6kaJfmzjsK2pNNJOetntcwLsgnEvHiC9cVrZPg,6049
 unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=0rpOh_pi4GormyIQsnEJbKVb7FeizAbLcbljpnjtpeY,5908
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=S0dEjT1UxReCC6qE9DlSQBgcSzQbOaIq7SMJqXUpNWQ,10858
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=NRn0lbOuXqIYqZT15IVFeFQCxpCKzZC_M8pVYZeeNfo,17933
@@ -346,16 +346,16 @@ unstructured_ingest/v2/processes/connectors/sql.py,sha256=mbhBI2tcX4q1YJwR3Nr7HG
 unstructured_ingest/v2/processes/connectors/utils.py,sha256=nmpZZCeX0O7rGrwHSWM_heBgpZK9tKT6EV1Moer-z40,576
 unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=HtJuOUhBs_HA7uOXlEIuYtx0elb0ecsCvP8N822tOMQ,8564
 unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
-unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=RN7zoifocIWVgoP9aMDMz4TP-Z9KhE-HbCCBq33fY90,4674
-unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=UnD-F9g7yOOBStrAqeKq6GuQjEyHdwOA3jYLj8YZIRM,4088
-unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=I6mPG9EIso9TcIczCw5Y14Yqd-EhTQ2CLw1MJx1V3dY,4420
-unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=zKrwKTVGnhnitD8h_Url5HRFsJZjM66o3jWrzAm-_UA,12153
-unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=RYZq_8hKF7bRxuB5Gozv5AzB3_nTuuooE4UfRjXwEFU,4443
-unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=PXK9a5O3woDuBWSf4R5XLQI5mzHtap8wAKpHI8Rh5gQ,5462
-unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=J7Ej-j7dtXAluHunwynUfHlNsYwymb-LsrGUFcljcsA,5700
+unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=liW0e8xe6iPYQTRqSL-L5YGXBdRrjqDDa_KE9a73x8Q,4922
+unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=ef1g8YLfGibaOmfOObXlDhoSF92ZIM1BGCo-FwLqCFQ,4312
+unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=sai1xeNwN2mQWqHe87GLUTMzad69-ida30MX6N7eTG0,4690
+unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=sfUreguw-End-MytYmNvdB9oIDlvW10Ib_g9_Cn1uac,12756
+unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=LFAHXtb2yzbYd00mytUnVuaw54O0bHsLJ5rWXdNYU90,4687
+unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=lIgPZdC0ErFERfh3hCUUTZcj3Q-O_36rfgupy2LpzrQ,5702
+unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=e3Zhl2Ulbf9j7YxJUL6MVZu15rrvmhNyPkACStxLv4U,5952
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
-unstructured_ingest-0.0.2.dist-info/METADATA,sha256=a68Sz8-m1-ZRFz0p4yic64BhgwTuMdIMmCuPECdhWwA,21568
-unstructured_ingest-0.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-unstructured_ingest-0.0.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.0.2.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
-unstructured_ingest-0.0.2.dist-info/RECORD,,
+unstructured_ingest-0.0.3.dist-info/METADATA,sha256=j2DLPr0il-IIdJfQZI9iwl_VTf5ADuZW6E9oZYyGQ40,6987
+unstructured_ingest-0.0.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+unstructured_ingest-0.0.3.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.0.3.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
+unstructured_ingest-0.0.3.dist-info/RECORD,,
@@ -1,321 +0,0 @@
-Metadata-Version: 2.1
-Name: unstructured-ingest
-Version: 0.0.2
-Summary: A library that prepares raw documents for downstream ML tasks.
-Home-page: https://github.com/Unstructured-IO/unstructured-ingest
-Author: Unstructured Technologies
-Author-email: devops@unstructuredai.io
-License: Apache-2.0
-Keywords: NLP PDF HTML CV XML parsing preprocessing
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: Intended Audience :: Education
-Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Requires-Python: >=3.9.0,<3.13
-Description-Content-Type: text/markdown
-Requires-Dist: unstructured
-Requires-Dist: python-dateutil
-Requires-Dist: pandas
-Provides-Extra: airtable
-Requires-Dist: pyairtable ; extra == 'airtable'
-Provides-Extra: astra
-Requires-Dist: astrapy ; extra == 'astra'
-Provides-Extra: azure
-Requires-Dist: fsspec ; extra == 'azure'
-Requires-Dist: adlfs ; extra == 'azure'
-Provides-Extra: azure-cognitive-search
-Requires-Dist: azure-search-documents ; extra == 'azure-cognitive-search'
-Provides-Extra: bedrock
-Requires-Dist: langchain-community ; extra == 'bedrock'
-Requires-Dist: boto3 ; extra == 'bedrock'
-Provides-Extra: biomed
-Requires-Dist: bs4 ; extra == 'biomed'
-Provides-Extra: box
-Requires-Dist: boxfs ; extra == 'box'
-Requires-Dist: fsspec ; extra == 'box'
-Provides-Extra: chroma
-Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
-Requires-Dist: typer <=0.9.0 ; extra == 'chroma'
-Requires-Dist: chromadb ; extra == 'chroma'
-Provides-Extra: clarifai
-Requires-Dist: clarifai ; extra == 'clarifai'
-Provides-Extra: confluence
-Requires-Dist: atlassian-python-api ; extra == 'confluence'
-Provides-Extra: csv
-Requires-Dist: unstructured[tsv] ; extra == 'csv'
-Provides-Extra: databricks-volumes
-Requires-Dist: databricks-sdk ; extra == 'databricks-volumes'
-Provides-Extra: delta-table
-Requires-Dist: deltalake ; extra == 'delta-table'
-Requires-Dist: fsspec ; extra == 'delta-table'
-Provides-Extra: discord
-Requires-Dist: discord-py ; extra == 'discord'
-Provides-Extra: doc
-Requires-Dist: unstructured[docx] ; extra == 'doc'
-Provides-Extra: docx
-Requires-Dist: unstructured[docx] ; extra == 'docx'
-Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs ; extra == 'dropbox'
-Requires-Dist: fsspec ; extra == 'dropbox'
-Provides-Extra: elasticsearch
-Requires-Dist: elasticsearch[async] ; extra == 'elasticsearch'
-Provides-Extra: embed-huggingface
-Requires-Dist: huggingface ; extra == 'embed-huggingface'
-Requires-Dist: sentence-transformers ; extra == 'embed-huggingface'
-Requires-Dist: langchain-community ; extra == 'embed-huggingface'
-Provides-Extra: embed-octoai
-Requires-Dist: tiktoken ; extra == 'embed-octoai'
-Requires-Dist: openai ; extra == 'embed-octoai'
-Provides-Extra: embed-vertexai
-Requires-Dist: langchain-community ; extra == 'embed-vertexai'
-Requires-Dist: langchain ; extra == 'embed-vertexai'
-Requires-Dist: langchain-google-vertexai ; extra == 'embed-vertexai'
-Provides-Extra: embed-voyageai
-Requires-Dist: langchain ; extra == 'embed-voyageai'
-Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
-Provides-Extra: epub
-Requires-Dist: unstructured[epub] ; extra == 'epub'
-Provides-Extra: gcs
-Requires-Dist: bs4 ; extra == 'gcs'
-Requires-Dist: gcsfs ; extra == 'gcs'
-Requires-Dist: fsspec ; extra == 'gcs'
-Provides-Extra: github
-Requires-Dist: pygithub >1.58.0 ; extra == 'github'
-Provides-Extra: gitlab
-Requires-Dist: python-gitlab ; extra == 'gitlab'
-Provides-Extra: google-drive
-Requires-Dist: google-api-python-client ; extra == 'google-drive'
-Provides-Extra: hubspot
-Requires-Dist: hubspot-api-client ; extra == 'hubspot'
-Requires-Dist: urllib3 ; extra == 'hubspot'
-Provides-Extra: jira
-Requires-Dist: atlassian-python-api ; extra == 'jira'
-Provides-Extra: kafka
-Requires-Dist: confluent-kafka ; extra == 'kafka'
-Provides-Extra: md
-Requires-Dist: unstructured[md] ; extra == 'md'
-Provides-Extra: milvus
-Requires-Dist: pymilvus ; extra == 'milvus'
-Provides-Extra: mongodb
-Requires-Dist: pymongo ; extra == 'mongodb'
-Provides-Extra: msg
-Requires-Dist: unstructured[msg] ; extra == 'msg'
-Provides-Extra: notion
-Requires-Dist: htmlBuilder ; extra == 'notion'
-Requires-Dist: notion-client ; extra == 'notion'
-Provides-Extra: odt
-Requires-Dist: unstructured[odt] ; extra == 'odt'
-Provides-Extra: onedrive
-Requires-Dist: msal ; extra == 'onedrive'
-Requires-Dist: bs4 ; extra == 'onedrive'
-Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
-Provides-Extra: openai
-Requires-Dist: tiktoken ; extra == 'openai'
-Requires-Dist: openai ; extra == 'openai'
-Requires-Dist: langchain-community ; extra == 'openai'
-Provides-Extra: opensearch
-Requires-Dist: opensearch-py ; extra == 'opensearch'
-Provides-Extra: org
-Requires-Dist: unstructured[org] ; extra == 'org'
-Provides-Extra: outlook
-Requires-Dist: msal ; extra == 'outlook'
-Requires-Dist: Office365-REST-Python-Client ; extra == 'outlook'
-Provides-Extra: pdf
-Requires-Dist: unstructured[pdf] ; extra == 'pdf'
-Provides-Extra: pinecone
-Requires-Dist: pinecone-client >=3.7.1 ; extra == 'pinecone'
-Provides-Extra: postgres
-Requires-Dist: psycopg2-binary ; extra == 'postgres'
-Provides-Extra: ppt
-Requires-Dist: unstructured[pptx] ; extra == 'ppt'
-Provides-Extra: pptx
-Requires-Dist: unstructured[pptx] ; extra == 'pptx'
-Provides-Extra: qdrant
-Requires-Dist: qdrant-client ; extra == 'qdrant'
-Provides-Extra: reddit
-Requires-Dist: praw ; extra == 'reddit'
-Provides-Extra: rst
-Requires-Dist: unstructured[rst] ; extra == 'rst'
-Provides-Extra: rtf
-Requires-Dist: unstructured[rtf] ; extra == 'rtf'
-Provides-Extra: s3
-Requires-Dist: s3fs ; extra == 's3'
-Requires-Dist: fsspec ; extra == 's3'
-Provides-Extra: salesforce
-Requires-Dist: simple-salesforce ; extra == 'salesforce'
-Provides-Extra: sftp
-Requires-Dist: paramiko ; extra == 'sftp'
-Requires-Dist: fsspec ; extra == 'sftp'
-Provides-Extra: sharepoint
-Requires-Dist: msal ; extra == 'sharepoint'
-Requires-Dist: Office365-REST-Python-Client ; extra == 'sharepoint'
-Provides-Extra: singlestore
-Requires-Dist: singlestoredb ; extra == 'singlestore'
-Provides-Extra: slack
-Requires-Dist: slack-sdk ; extra == 'slack'
-Provides-Extra: tsv
-Requires-Dist: unstructured[tsv] ; extra == 'tsv'
-Provides-Extra: weaviate
-Requires-Dist: weaviate-client ; extra == 'weaviate'
-Provides-Extra: wikipedia
-Requires-Dist: wikipedia ; extra == 'wikipedia'
-Provides-Extra: xlsx
-Requires-Dist: unstructured[xlsx] ; extra == 'xlsx'
-
-# Batch Processing Documents [DEPRECATED]
-For the latest approach, go to: [v2](./v2)
-
-## The unstructured-ingest CLI
-
-The unstructured library includes a CLI to batch ingest documents from various sources, storing structured outputs locally on the filesystem.
-
-For example, the following command processes all the documents in S3 in the
-`utic-dev-tech-fixtures` bucket with a prefix of `small-pdf-set/`.
-
-    unstructured-ingest \
-      s3 \
-      --remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
-      --anonymous \
-      --output-dir s3-small-batch-output \
-      --num-processes 2
-
-Naturally, --num-processes may be adjusted for better instance utilization with multiprocessing.
-
-Installation note: make sure to install the following extras when installing unstructured, needed for the above command:
-
-    pip install "unstructured[s3,local-inference]"
-
-See the [Quick Start](https://github.com/Unstructured-IO/unstructured#eight_pointed_black_star-quick-start), which documents how to pip install `detectron2` and other OS dependencies necessary for parsing .PDF files.
-
-# Developers' Guide
-
-## Local testing
-
-When testing from a local checkout rather than a pip-installed version of `unstructured`,
-just execute `unstructured_ingest/main.py`, e.g.:
-
-    PYTHONPATH=. ./unstructured_ingest/main.py \
-      s3 \
-      --remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
-      --anonymous \
-      --output-dir s3-small-batch-output \
-      --num-processes 2
-
-## Adding Source Data Connectors
-
-To add a connector, refer to [unstructured_ingest/connector/github.py](unstructured_ingest/connector/github.py) as an example that implements the three relevant abstract base classes.
-
-If the connector has an available `fsspec` implementation, then refer to [unstructured_ingest/connector/s3.py](unstructured_ingest/connector/s3.py).
-
-Then, update [unstructured_ingest/cli](unstructured_ingest/cli) to add a subcommand associated with the connector, and hook it up to the parent group.
-
-Add an implementation of `BaseRunner` in the runner directory to connect the invocation of the CLI with the underlying connector created.
-
-Create at least one folder under [examples/ingest](examples/ingest) with an easily reproducible
-script that shows the new connector in action.
-
-Finally, to ensure the connector remains stable, add a new script test_unstructured_ingest/test-ingest-\<the-new-data-source\>.sh similar to [test_unstructured_ingest/test-ingest-s3.sh](test_unstructured_ingest/test-ingest-s3.sh), and append a line invoking the new script in [test_unstructured_ingest/test-ingest.sh](test_unstructured_ingest/test-ingest.sh).
-
-You'll notice that the unstructured outputs for the new documents are expected
-to be checked into CI under test_unstructured_ingest/expected-structured-output/\<folder-name-relevant-to-your-dataset\>. So, you'll need to `git add` those JSON outputs so that `test-ingest.sh` passes in CI.
-
-The `main.py` flags --re-download/--no-re-download, --download-dir, --preserve-downloads, --structured-output-dir, and --reprocess are honored by the connector.
-
-## Adding Destination Data Connectors
-
-To add a destination connector, refer to [unstructured_ingest/connector/delta-table.py](unstructured_ingest/connector/delta-table.py) as an example, which extends the `BaseDestinationConnector` and the `WriteConfig`. It also shows how an existing data provider can be used for both a source and a destination connector.
-
-Similar to the runner used to connect source connectors with the CLI, destination connectors require an entry in the writer map defined in [unstructured_ingest/runner/writers.py](unstructured_ingest/runner/writers.py). This allows any source connector to use any destination connector.
-
-Regarding the entry in the CLI, destination connectors are exposed as a subcommand that gets added to each source connector parent command. Special care needs to be taken here to not break the code being run by the source connector. Take a look at how the base runner class is dynamically pulled using the name of the parent CLI command in [unstructured_ingest/cli/cmds/delta_table.py](unstructured_ingest/cli/cmds/delta_table.py).
-
-Similar tests and examples should be added to demonstrate/validate the use of the destination connector, following the steps laid out for a source connector.
-
-### The checklist:
-
-In checklist form, the above steps are summarized as:
-
-- [ ] Create a new module under [unstructured_ingest/connector/](unstructured_ingest/connector/) implementing the 3 abstract base classes, similar to [unstructured_ingest/connector/github.py](unstructured_ingest/connector/github.py).
-- [ ] The subclass of `BaseIngestDoc` overrides `process_file()` if extra processing logic is needed other than what is provided by [auto.partition()](unstructured/partition/auto.py).
-- [ ] If the IngestDoc relies on a connection or session that could be reused, the subclass of `BaseConnectorConfig` implements a session handle to manage connections. The ConnectorConfig subclass should also inherit from `ConfigSessionHandleMixin` and the IngestDoc subclass should also inherit from `IngestDocSessionHandleMixin`. Check [here](https://github.com/Unstructured-IO/unstructured/pull/1058/files#diff-dae96d30f58cffe1b348c036d006b48bdc7e2e47fbd7c8ec1c45d63face1542d) for a detailed example.
-- [ ] The subclass of `BaseIngestDoc` implements relevant data source properties to include metadata. Check [this PR](https://github.com/Unstructured-IO/unstructured/pull/1283) for detailed examples.
-- [ ] The `record_locator` property should include all of the information required to reach the document in the source platform.
-- [ ] Add the relevant decorators from `unstructured.ingest.error` on top of relevant methods to handle errors such as a source connection error, destination connection error, or a partition error. For examples, check [here](https://github.com/Unstructured-IO/unstructured/commit/92692ad8d7d5001601dd88fef869a29660f492cb).
-- [ ] Update [unstructured_ingest/cli](unstructured_ingest/cli) with support for the new connector.
-- [ ] Create a folder under [examples/ingest](examples/ingest) that includes at least one well documented script.
-- [ ] Add a script test_unstructured_ingest/test-ingest-\<the-new-data-source\>.sh. Its JSON output files should total no more than 100K.
-- [ ] Git add the expected outputs under test_unstructured_ingest/expected-structured-output/\<folder-name-relevant-to-your-dataset\> so the above test passes in CI.
-- [ ] Add a line to [test_unstructured_ingest/test-ingest.sh](test_unstructured_ingest/test-ingest.sh) invoking the new test script.
-- [ ] Make sure the tests for the connector are running and not skipped by reviewing the logs in CI.
-- [ ] If additional python dependencies are needed for the new connector:
-  - [ ] Add them as an extra to [setup.py](unstructured/setup.py).
-  - [ ] Update the Makefile, adding a target for `install-ingest-<name>` and adding another `pip-compile` line to the `pip-compile` make target. See [this commit](https://github.com/Unstructured-IO/unstructured/commit/ab542ca3c6274f96b431142262d47d727f309e37) for a reference.
-  - [ ] The added dependencies should be imported at runtime when the new connector is invoked, rather than as top-level imports.
-  - [ ] Add the decorator `unstructured.utils.requires_dependencies` on top of each class instance or function that uses those connector-specific dependencies, e.g. for `GitHubConnector` it should look like `@requires_dependencies(dependencies=["github"], extras="github")`.
-- [ ] Run `make tidy` and `make check` to ensure linting checks pass.
-- [ ] Update the ingest documentation [here](https://github.com/Unstructured-IO/unstructured/tree/main/docs/source).
-- [ ] For team members that are developing in the original repository:
-  - [ ] If there are secret variables created for the connector tests, make sure to:
-    - [ ] add the secrets into Github (contact someone with access)
-    - [ ] include the secret variables in [`ci.yml`](https://github.com/Unstructured-IO/unstructured/blob/main/.github/workflows/ci.yml) and [`ingest-test-fixtures-update-pr.yml`](https://github.com/Unstructured-IO/unstructured/blob/main/.github/workflows/ingest-test-fixtures-update-pr.yml)
-    - [ ] add a make install line in the workflow configurations to provide the workflow machine with the required dependencies on the connector while testing
-  - [ ] Whenever necessary, use the [ingest update test fixtures](https://github.com/Unstructured-IO/unstructured/actions/workflows/ingest-test-fixtures-update-pr.yml) workflow to update the test fixtures.
-- [ ] Honors the conventions of `BaseConnectorConfig` defined in [unstructured_ingest/interfaces.py](unstructured_ingest/interfaces.py), which is passed through [the CLI](unstructured_ingest/main.py):
-  - [ ] If running with an `.output_dir` where structured outputs already exist for a given file, the file content is not re-downloaded from the data source nor is it reprocessed. This is made possible by implementing the call to `MyIngestDoc.has_output()`, which is invoked in [MainProcess._filter_docs_with_outputs](ingest-prep-for-many/unstructured_ingest/main.py).
-    - [ ] Unless `.reprocess` is `True`, in which case documents are always reprocessed.
-  - [ ] If `.preserve_download` is `True`, documents downloaded to `.download_dir` are not removed after processing.
-    - [ ] Else if `.preserve_download` is `False`, documents downloaded to `.download_dir` are removed after they are **successfully** processed, during the invocation of `MyIngestDoc.cleanup_file()` in [process_document](unstructured_ingest/doc_processor/generalized.py).
-  - [ ] Does not re-download documents to `.download_dir` if `.re_download` is `False`, enforced in `MyIngestDoc.get_file()`.
-  - [ ] Prints more details if `--verbose` is set in the ingest CLI, similar to the logging messages in [unstructured_ingest/connector/github.py](unstructured_ingest/connector/github.py).
-
-## Design References
-
-`unstructured_ingest/main.py` is the entrypoint for the `unstructured-ingest` CLI. It calls the cli Command as fetched from `cli.py` `get_cmd()`.
-
-The ingest directory is broken up in such a way that most of the code can be used with or without invoking the CLI itself:
-
-* **Connector:** This houses the main code that is responsible for reaching out to external data providers and pulling down the data (i.e. S3, Azure, etc.)
-* **Runner:** This serves as the interface between the CLI-specific commands and running the connector code. A base runner class defines much of the common functionality across all connectors and allows typed methods to be defined to explicitly connect the CLI command to the specific connector.
-* **CLI:** This is where the `Click` python library is introduced to create the CLI bindings that a user interacts with when invoking the CLI directly. Many of the common options across commands are abstracted away and added dynamically to click commands.
-
-The ingest flow is similar to an ETL pipeline that gets defined at runtime based on user input:
-
-![unstructured ingest cli diagram](img/unstructured_ingest_cli_pipeline_diagram.png)
-
-Each step in the pipeline caches the results in a default location if one is not provided to it. This allows the pipeline
-to pick up where it ended if an error occurred before it finished, without having to recompute everything that ran successfully.
-It uses a hash of the parameters passed in for each step, along with the previous step, to know if the results it already has are
-still valid or should be recomputed regardless of them existing already. This allows you to change parameters associated with a
-step at the tail end of the pipeline and it only recomputes from there.
-
-**Multiprocessing:** One of the options for the pipeline is how many processes to use. Not all steps support multiprocessing, but if they do, a multiprocessing Pool is used to speed up the process. For debugging purposes, if a single process is set, the multiprocessing Pool isn't used at all.
-
-While all the configurations are added to a single Click command as options when the CLI is invoked, many of these are bundled together based on a particular step in the pipeline. A `BaseConfig`
-is extended in the root interfaces file and then that can be extended once again in the CLI-specific interfaces file, which adds a function for how the fields in the base config should be mapped to `Click` options.
-### Configs
-* `PartitionConfig`: Data associated with running the partitioning over the files pulled down via the source connector.
-* `ProcessorConfig`: Data around the process as a whole, such as the number of processes to use when running, where to store the final result of the pipeline, and whether an error should be raised if a single doc fails. By default, the pipeline will continue with what it can, so if one doc out of many fails, an error will be logged and the rest will continue.
-* `ReadConfig`: Data associated with pulling the data from the source data provider, such as whether it should be re-downloaded regardless of the files already existing.
-* `EmbeddingConfig`: Data associated with running an optional embedder on the data, which adds a new field to the output JSON for each element with its associated embeddings vector.
-* `ChunkingConfig`: Data associated with running an optional chunker over the partitioned data.
-* `PermissionsConfig`: Data associated with pulling down permissions data (i.e. RBAC). This is an optional feature and, if enabled, will append the information pulled down to the metadata associated with an element.
-* `WriteConfig`: Any specific data needed to write to a destination connector. This does not have to be used if not needed.
-
-For the flow of the pipeline, the only required steps are:
-* **Doc Factory:** This creates instances of `BaseIngestDoc`, which provide references to a file on the source data provider without downloading anything yet.
-* **Source Node:** This is responsible for downloading the content and producing a representation of that content suitable for partitioning.
-* **Partitioner:** Responsible for running partition over the content produced by the previous source node.
-
-Optional steps:
-* **Reformat Nodes:** Any number of reformat nodes can be set to modify the partitioned content. Currently chunking and embedding are supported.
-* **Write Node:** If set, write the results to a destination via a destination connector.
-
-Because there can be any number of reformat nodes, the final destination is not deterministic, so an extra step is added at the end of all reformat nodes to copy the final result to the location the user expects it to be when the pipeline ends.