unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (192)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
test/unit/test_utils.py DELETED
@@ -1,220 +0,0 @@
1
- import base64
2
- import json
3
- import zlib
4
- from datetime import datetime
5
- from typing import Any
6
-
7
- import pytest
8
- import pytz
9
- from pydantic import BaseModel, Field, Secret, SecretStr
10
- from pydantic.types import _SecretBase
11
-
12
- from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
13
- from unstructured_ingest.utils.pydantic_models import (
14
- serialize_base_model,
15
- serialize_base_model_json,
16
- )
17
- from unstructured_ingest.utils.string_and_date_utils import (
18
- ensure_isoformat_datetime,
19
- fix_unescaped_unicode,
20
- json_to_dict,
21
- truncate_string_bytes,
22
- )
23
-
24
- flat_data = {"a": "test", "b": 4, "c": True}
25
-
26
-
27
- def test_json_to_dict_valid_json():
28
- json_string = '{"key": "value"}'
29
- expected_result = {"key": "value"}
30
- assert json_to_dict(json_string) == expected_result
31
- assert isinstance(json_to_dict(json_string), dict)
32
-
33
-
34
- def test_json_to_dict_malformed_json():
35
- json_string = '{"key": "value"'
36
- expected_result = '{"key": "value"'
37
- assert json_to_dict(json_string) == expected_result
38
- assert isinstance(json_to_dict(json_string), str)
39
-
40
-
41
- def test_json_to_dict_single_quotes():
42
- json_string = "{'key': 'value'}"
43
- expected_result = {"key": "value"}
44
- assert json_to_dict(json_string) == expected_result
45
- assert isinstance(json_to_dict(json_string), dict)
46
-
47
-
48
- def test_json_to_dict_path():
49
- json_string = "/path/to/file.json"
50
- expected_result = "/path/to/file.json"
51
- assert json_to_dict(json_string) == expected_result
52
- assert isinstance(json_to_dict(json_string), str)
53
-
54
-
55
- def test_ensure_isoformat_datetime_for_datetime():
56
- dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0))
57
- assert dt == "2021-01-01T12:00:00"
58
-
59
-
60
- def test_ensure_isoformat_datetime_for_datetime_with_tz():
61
- dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0, tzinfo=pytz.UTC))
62
- assert dt == "2021-01-01T12:00:00+00:00"
63
-
64
-
65
- def test_ensure_isoformat_datetime_for_string():
66
- dt = ensure_isoformat_datetime("2021-01-01T12:00:00")
67
- assert dt == "2021-01-01T12:00:00"
68
-
69
-
70
- def test_ensure_isoformat_datetime_for_string2():
71
- dt = ensure_isoformat_datetime("2021-01-01T12:00:00+00:00")
72
- assert dt == "2021-01-01T12:00:00+00:00"
73
-
74
-
75
- def test_ensure_isoformat_datetime_fails_on_string():
76
- with pytest.raises(ValueError):
77
- ensure_isoformat_datetime("bad timestamp")
78
-
79
-
80
- def test_ensure_isoformat_datetime_fails_on_int():
81
- with pytest.raises(TypeError):
82
- ensure_isoformat_datetime(1111)
83
-
84
-
85
- def test_truncate_string_bytes_return_truncated_string():
86
- test_string = "abcdef안녕하세요ghijklmn방갑습니opqrstu 더 길어지면 안되는 문자열vwxyz"
87
- max_bytes = 11
88
- result = truncate_string_bytes(test_string, max_bytes)
89
- assert result == "abcdef안"
90
- assert len(result.encode("utf-8")) <= max_bytes
91
-
92
-
93
- def test_truncate_string_bytes_return_untouched_string():
94
- test_string = "abcdef"
95
- max_bytes = 11
96
- result = truncate_string_bytes(test_string, max_bytes)
97
- assert result == "abcdef"
98
- assert len(result.encode("utf-8")) <= max_bytes
99
-
100
-
101
- def test_fix_unescaped_unicode_valid():
102
- text = "This is a test with unescaped unicode: \\u0041"
103
- expected = "This is a test with unescaped unicode: \u0041"
104
- assert fix_unescaped_unicode(text) == expected
105
-
106
-
107
- def test_fix_unescaped_unicode_no_unescaped_chars():
108
- text = "This is a test with no unescaped unicode: \u0041"
109
- expected = "This is a test with no unescaped unicode: \u0041"
110
- assert fix_unescaped_unicode(text) == expected
111
-
112
-
113
- def test_fix_unescaped_unicode_invalid_unicode():
114
- text = "This is a test with invalid unescaped unicode: \\uZZZZ"
115
- expected = "This is a test with invalid unescaped unicode: \\uZZZZ"
116
- assert fix_unescaped_unicode(text) == expected
117
-
118
-
119
- def test_fix_unescaped_unicode_encoding_error(caplog: pytest.LogCaptureFixture):
120
- text = "This is a test with unescaped unicode: \\uD83D"
121
- fix_unescaped_unicode(text)
122
- with caplog.at_level("WARNING"):
123
- fix_unescaped_unicode(text)
124
- assert "Failed to fix unescaped Unicode sequences" in caplog.text
125
-
126
-
127
- class MockChildBaseModel(BaseModel):
128
- child_secret_str: SecretStr
129
- child_secret_float: Secret[float]
130
- child_not_secret_dict: dict[str, Any] = Field(default_factory=dict)
131
-
132
-
133
- class MockBaseModel(BaseModel):
134
- secret_str: SecretStr
135
- not_secret_bool: bool
136
- secret_child_base: Secret[MockChildBaseModel]
137
- not_secret_list: list[int] = Field(default_factory=list)
138
-
139
-
140
- model = MockBaseModel(
141
- secret_str="secret string",
142
- not_secret_bool=False,
143
- secret_child_base=MockChildBaseModel(
144
- child_secret_str="child secret string",
145
- child_secret_float=3.14,
146
- child_not_secret_dict={"key": "value"},
147
- ),
148
- not_secret_list=[1, 2, 3],
149
- )
150
-
151
-
152
- def test_serialize_base_model():
153
-
154
- serialized_dict = model.model_dump()
155
- assert isinstance(serialized_dict["secret_str"], _SecretBase)
156
- assert isinstance(serialized_dict["secret_child_base"], _SecretBase)
157
-
158
- serialized_dict_w_secrets = serialize_base_model(model=model)
159
- assert not isinstance(serialized_dict_w_secrets["secret_str"], _SecretBase)
160
- assert not isinstance(serialized_dict_w_secrets["secret_child_base"], _SecretBase)
161
-
162
- expected_dict = {
163
- "secret_str": "secret string",
164
- "not_secret_bool": False,
165
- "secret_child_base": {
166
- "child_secret_str": "child secret string",
167
- "child_secret_float": 3.14,
168
- "child_not_secret_dict": {"key": "value"},
169
- },
170
- "not_secret_list": [1, 2, 3],
171
- }
172
-
173
- assert serialized_dict_w_secrets == expected_dict
174
-
175
-
176
- def test_serialize_base_model_json():
177
- serialized_json = model.model_dump_json()
178
- serialized_dict = json.loads(serialized_json)
179
- expected_dict = {
180
- "secret_str": "**********",
181
- "not_secret_bool": False,
182
- "secret_child_base": "**********",
183
- "not_secret_list": [1, 2, 3],
184
- }
185
- assert expected_dict == serialized_dict
186
-
187
- serialized_json_w_secrets = serialize_base_model_json(model=model)
188
- serialized_dict_w_secrets = json.loads(serialized_json_w_secrets)
189
- expected_dict_w_secrets = {
190
- "secret_str": "secret string",
191
- "not_secret_bool": False,
192
- "secret_child_base": {
193
- "child_secret_str": "child secret string",
194
- "child_secret_float": 3.14,
195
- "child_not_secret_dict": {"key": "value"},
196
- },
197
- "not_secret_list": [1, 2, 3],
198
- }
199
- assert expected_dict_w_secrets == serialized_dict_w_secrets
200
-
201
-
202
- def test_format_and_truncate_orig_elements():
203
- original_elements = [
204
- {
205
- "text": "Hello, world!",
206
- "metadata": {
207
- "image_base64": "iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAABwUlEQVR42mNk",
208
- "text_as_html": "<p>Hello, world!</p>",
209
- "page": 1,
210
- },
211
- }
212
- ]
213
- json_bytes = json.dumps(original_elements, sort_keys=True).encode("utf-8")
214
- deflated_bytes = zlib.compress(json_bytes)
215
- b64_deflated_bytes = base64.b64encode(deflated_bytes)
216
- b64_deflated_bytes.decode("utf-8")
217
-
218
- assert format_and_truncate_orig_elements(
219
- {"text": "Hello, world!", "metadata": {"orig_elements": b64_deflated_bytes.decode("utf-8")}}
220
- ) == [{"metadata": {"page": 1}}]
test/unit/utils/__init__.py DELETED
File without changes
test/unit/utils/data_generator.py DELETED
@@ -1,32 +0,0 @@
1
- import random
2
- from typing import Any, Type
3
-
4
- from faker import Faker
5
-
6
- fake = Faker()
7
-
8
- type_to_random_value_map = {
9
- str: fake.sentence,
10
- int: fake.random_int,
11
- float: fake.random_digit,
12
- bool: fake.boolean,
13
- }
14
- type_to_random_value_map_key = type_to_random_value_map.copy()
15
- type_to_random_value_map_key[str] = fake.word
16
-
17
-
18
- def generate_random_dictionary(key_type: Type = str, value_type: Type = str) -> dict:
19
- d = {}
20
- num_keys = random.randint(1, 3)
21
- for i in range(num_keys):
22
- key = type_to_random_value_map_key[key_type]()
23
- current_value_type = value_type
24
- if current_value_type == Any:
25
- current_value_type = random.choice(list(type_to_random_value_map.keys()) + [dict])
26
- value = (
27
- generate_random_dictionary(key_type=key_type, value_type=value_type)
28
- if current_value_type is dict
29
- else type_to_random_value_map[current_value_type]()
30
- )
31
- d[key] = value
32
- return d
unstructured_ingest-0.7.1.dist-info/METADATA DELETED
@@ -1,383 +0,0 @@
1
- Metadata-Version: 2.2
2
- Name: unstructured-ingest
3
- Version: 0.7.1
4
- Summary: A library that prepares raw documents for downstream ML tasks.
5
- Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
- Author: Unstructured Technologies
7
- Author-email: devops@unstructuredai.io
8
- License: Apache-2.0
9
- Keywords: NLP PDF HTML CV XML parsing preprocessing
10
- Classifier: Development Status :: 4 - Beta
11
- Classifier: Intended Audience :: Developers
12
- Classifier: Intended Audience :: Education
13
- Classifier: Intended Audience :: Science/Research
14
- Classifier: License :: OSI Approved :: Apache Software License
15
- Classifier: Operating System :: OS Independent
16
- Classifier: Programming Language :: Python :: 3
17
- Classifier: Programming Language :: Python :: 3.9
18
- Classifier: Programming Language :: Python :: 3.10
19
- Classifier: Programming Language :: Python :: 3.11
20
- Classifier: Programming Language :: Python :: 3.12
21
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
- Requires-Python: >=3.9.0,<3.14
23
- Description-Content-Type: text/markdown
24
- License-File: LICENSE.md
25
- Requires-Dist: opentelemetry-sdk
26
- Requires-Dist: dataclasses_json
27
- Requires-Dist: click
28
- Requires-Dist: tqdm
29
- Requires-Dist: pydantic>=2.7
30
- Requires-Dist: python-dateutil
31
- Requires-Dist: pandas
32
- Requires-Dist: numpy
33
- Provides-Extra: remote
34
- Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
35
- Requires-Dist: pandas; extra == "remote"
36
- Requires-Dist: numpy; extra == "remote"
37
- Provides-Extra: csv
38
- Requires-Dist: unstructured[tsv]; extra == "csv"
39
- Requires-Dist: pandas; extra == "csv"
40
- Requires-Dist: numpy; extra == "csv"
41
- Provides-Extra: doc
42
- Requires-Dist: unstructured[docx]; extra == "doc"
43
- Requires-Dist: pandas; extra == "doc"
44
- Requires-Dist: numpy; extra == "doc"
45
- Provides-Extra: docx
46
- Requires-Dist: unstructured[docx]; extra == "docx"
47
- Requires-Dist: pandas; extra == "docx"
48
- Requires-Dist: numpy; extra == "docx"
49
- Provides-Extra: epub
50
- Requires-Dist: unstructured[epub]; extra == "epub"
51
- Requires-Dist: pandas; extra == "epub"
52
- Requires-Dist: numpy; extra == "epub"
53
- Provides-Extra: md
54
- Requires-Dist: unstructured[md]; extra == "md"
55
- Requires-Dist: pandas; extra == "md"
56
- Requires-Dist: numpy; extra == "md"
57
- Provides-Extra: msg
58
- Requires-Dist: unstructured[msg]; extra == "msg"
59
- Requires-Dist: pandas; extra == "msg"
60
- Requires-Dist: numpy; extra == "msg"
61
- Provides-Extra: odt
62
- Requires-Dist: unstructured[odt]; extra == "odt"
63
- Requires-Dist: pandas; extra == "odt"
64
- Requires-Dist: numpy; extra == "odt"
65
- Provides-Extra: org
66
- Requires-Dist: unstructured[org]; extra == "org"
67
- Requires-Dist: pandas; extra == "org"
68
- Requires-Dist: numpy; extra == "org"
69
- Provides-Extra: pdf
70
- Requires-Dist: unstructured[pdf]; extra == "pdf"
71
- Requires-Dist: pandas; extra == "pdf"
72
- Requires-Dist: numpy; extra == "pdf"
73
- Provides-Extra: ppt
74
- Requires-Dist: unstructured[pptx]; extra == "ppt"
75
- Requires-Dist: pandas; extra == "ppt"
76
- Requires-Dist: numpy; extra == "ppt"
77
- Provides-Extra: pptx
78
- Requires-Dist: unstructured[pptx]; extra == "pptx"
79
- Requires-Dist: pandas; extra == "pptx"
80
- Requires-Dist: numpy; extra == "pptx"
81
- Provides-Extra: rtf
82
- Requires-Dist: unstructured[rtf]; extra == "rtf"
83
- Requires-Dist: pandas; extra == "rtf"
84
- Requires-Dist: numpy; extra == "rtf"
85
- Provides-Extra: rst
86
- Requires-Dist: unstructured[rst]; extra == "rst"
87
- Requires-Dist: pandas; extra == "rst"
88
- Requires-Dist: numpy; extra == "rst"
89
- Provides-Extra: tsv
90
- Requires-Dist: unstructured[tsv]; extra == "tsv"
91
- Requires-Dist: pandas; extra == "tsv"
92
- Requires-Dist: numpy; extra == "tsv"
93
- Provides-Extra: xlsx
94
- Requires-Dist: unstructured[xlsx]; extra == "xlsx"
95
- Requires-Dist: pandas; extra == "xlsx"
96
- Requires-Dist: numpy; extra == "xlsx"
97
- Provides-Extra: airtable
98
- Requires-Dist: pyairtable; extra == "airtable"
99
- Requires-Dist: pandas; extra == "airtable"
100
- Requires-Dist: numpy; extra == "airtable"
101
- Provides-Extra: astradb
102
- Requires-Dist: astrapy; extra == "astradb"
103
- Requires-Dist: pandas; extra == "astradb"
104
- Requires-Dist: numpy; extra == "astradb"
105
- Provides-Extra: azure
106
- Requires-Dist: fsspec; extra == "azure"
107
- Requires-Dist: adlfs; extra == "azure"
108
- Requires-Dist: pandas; extra == "azure"
109
- Requires-Dist: numpy; extra == "azure"
110
- Provides-Extra: azure-ai-search
111
- Requires-Dist: azure-search-documents; extra == "azure-ai-search"
112
- Requires-Dist: pandas; extra == "azure-ai-search"
113
- Requires-Dist: numpy; extra == "azure-ai-search"
114
- Provides-Extra: biomed
115
- Requires-Dist: bs4; extra == "biomed"
116
- Requires-Dist: requests; extra == "biomed"
117
- Requires-Dist: pandas; extra == "biomed"
118
- Requires-Dist: numpy; extra == "biomed"
119
- Provides-Extra: box
120
- Requires-Dist: fsspec; extra == "box"
121
- Requires-Dist: boxfs; extra == "box"
122
- Requires-Dist: pandas; extra == "box"
123
- Requires-Dist: numpy; extra == "box"
124
- Provides-Extra: chroma
125
- Requires-Dist: chromadb; extra == "chroma"
126
- Requires-Dist: pandas; extra == "chroma"
127
- Requires-Dist: numpy; extra == "chroma"
128
- Provides-Extra: clarifai
129
- Requires-Dist: clarifai; extra == "clarifai"
130
- Requires-Dist: pandas; extra == "clarifai"
131
- Requires-Dist: numpy; extra == "clarifai"
132
- Provides-Extra: confluence
133
- Requires-Dist: requests; extra == "confluence"
134
- Requires-Dist: atlassian-python-api; extra == "confluence"
135
- Requires-Dist: pandas; extra == "confluence"
136
- Requires-Dist: numpy; extra == "confluence"
137
- Provides-Extra: couchbase
138
- Requires-Dist: couchbase; extra == "couchbase"
139
- Requires-Dist: pandas; extra == "couchbase"
140
- Requires-Dist: numpy; extra == "couchbase"
141
- Provides-Extra: delta-table
142
- Requires-Dist: deltalake; extra == "delta-table"
143
- Requires-Dist: boto3; extra == "delta-table"
144
- Requires-Dist: pandas; extra == "delta-table"
145
- Requires-Dist: numpy; extra == "delta-table"
146
- Provides-Extra: discord
147
- Requires-Dist: discord.py; extra == "discord"
148
- Requires-Dist: pandas; extra == "discord"
149
- Requires-Dist: numpy; extra == "discord"
150
- Provides-Extra: dropbox
151
- Requires-Dist: dropboxdrivefs; extra == "dropbox"
152
- Requires-Dist: fsspec; extra == "dropbox"
153
- Requires-Dist: pandas; extra == "dropbox"
154
- Requires-Dist: numpy; extra == "dropbox"
155
- Provides-Extra: duckdb
156
- Requires-Dist: duckdb; extra == "duckdb"
157
- Requires-Dist: pandas; extra == "duckdb"
158
- Requires-Dist: numpy; extra == "duckdb"
159
- Provides-Extra: elasticsearch
160
- Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
161
- Requires-Dist: pandas; extra == "elasticsearch"
162
- Requires-Dist: numpy; extra == "elasticsearch"
163
- Provides-Extra: gcs
164
- Requires-Dist: bs4; extra == "gcs"
165
- Requires-Dist: fsspec; extra == "gcs"
166
- Requires-Dist: gcsfs; extra == "gcs"
167
- Requires-Dist: pandas; extra == "gcs"
168
- Requires-Dist: numpy; extra == "gcs"
169
- Provides-Extra: github
170
- Requires-Dist: requests; extra == "github"
171
- Requires-Dist: pygithub>1.58.0; extra == "github"
172
- Requires-Dist: pandas; extra == "github"
173
- Requires-Dist: numpy; extra == "github"
174
- Provides-Extra: gitlab
175
- Requires-Dist: python-gitlab; extra == "gitlab"
176
- Requires-Dist: pandas; extra == "gitlab"
177
- Requires-Dist: numpy; extra == "gitlab"
178
- Provides-Extra: google-drive
179
- Requires-Dist: google-api-python-client; extra == "google-drive"
180
- Requires-Dist: pandas; extra == "google-drive"
181
- Requires-Dist: numpy; extra == "google-drive"
182
- Provides-Extra: hubspot
183
- Requires-Dist: hubspot-api-client; extra == "hubspot"
184
- Requires-Dist: urllib3; extra == "hubspot"
185
- Requires-Dist: pandas; extra == "hubspot"
186
- Requires-Dist: numpy; extra == "hubspot"
187
- Provides-Extra: ibm-watsonx-s3
188
- Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
189
- Requires-Dist: httpx; extra == "ibm-watsonx-s3"
190
- Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
191
- Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
192
- Requires-Dist: pandas; extra == "ibm-watsonx-s3"
193
- Requires-Dist: numpy; extra == "ibm-watsonx-s3"
194
- Provides-Extra: jira
195
- Requires-Dist: atlassian-python-api; extra == "jira"
196
- Requires-Dist: pandas; extra == "jira"
197
- Requires-Dist: numpy; extra == "jira"
198
- Provides-Extra: kafka
199
- Requires-Dist: confluent-kafka; extra == "kafka"
200
- Requires-Dist: pandas; extra == "kafka"
201
- Requires-Dist: numpy; extra == "kafka"
202
- Provides-Extra: kdbai
203
- Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
204
- Requires-Dist: pandas; extra == "kdbai"
205
- Requires-Dist: numpy; extra == "kdbai"
206
- Provides-Extra: lancedb
207
- Requires-Dist: lancedb; extra == "lancedb"
208
- Requires-Dist: pandas; extra == "lancedb"
209
- Requires-Dist: numpy; extra == "lancedb"
210
- Provides-Extra: milvus
211
- Requires-Dist: pymilvus; extra == "milvus"
212
- Requires-Dist: pandas; extra == "milvus"
213
- Requires-Dist: numpy; extra == "milvus"
214
- Provides-Extra: mongodb
215
- Requires-Dist: pymongo; extra == "mongodb"
216
- Requires-Dist: pandas; extra == "mongodb"
217
- Requires-Dist: numpy; extra == "mongodb"
218
- Provides-Extra: neo4j
219
- Requires-Dist: cymple; extra == "neo4j"
220
- Requires-Dist: neo4j-rust-ext; extra == "neo4j"
221
- Requires-Dist: networkx; extra == "neo4j"
222
- Requires-Dist: pandas; extra == "neo4j"
223
- Requires-Dist: numpy; extra == "neo4j"
224
- Provides-Extra: notion
225
- Requires-Dist: notion-client; extra == "notion"
226
- Requires-Dist: backoff; extra == "notion"
227
- Requires-Dist: httpx; extra == "notion"
228
- Requires-Dist: htmlBuilder; extra == "notion"
229
- Requires-Dist: pandas; extra == "notion"
230
- Requires-Dist: numpy; extra == "notion"
231
- Provides-Extra: onedrive
232
- Requires-Dist: msal; extra == "onedrive"
233
- Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
234
- Requires-Dist: requests; extra == "onedrive"
235
- Requires-Dist: pandas; extra == "onedrive"
236
- Requires-Dist: numpy; extra == "onedrive"
237
- Provides-Extra: opensearch
238
- Requires-Dist: opensearch-py; extra == "opensearch"
239
- Requires-Dist: pandas; extra == "opensearch"
240
- Requires-Dist: numpy; extra == "opensearch"
241
- Provides-Extra: outlook
242
- Requires-Dist: msal; extra == "outlook"
243
- Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
244
- Requires-Dist: pandas; extra == "outlook"
245
- Requires-Dist: numpy; extra == "outlook"
246
- Provides-Extra: pinecone
247
- Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
248
- Requires-Dist: pandas; extra == "pinecone"
249
- Requires-Dist: numpy; extra == "pinecone"
250
- Provides-Extra: postgres
251
- Requires-Dist: psycopg2-binary; extra == "postgres"
252
- Requires-Dist: pandas; extra == "postgres"
253
- Requires-Dist: numpy; extra == "postgres"
254
- Provides-Extra: qdrant
255
- Requires-Dist: qdrant-client; extra == "qdrant"
256
- Requires-Dist: pandas; extra == "qdrant"
257
- Requires-Dist: numpy; extra == "qdrant"
258
- Provides-Extra: reddit
259
- Requires-Dist: praw; extra == "reddit"
260
- Requires-Dist: pandas; extra == "reddit"
261
- Requires-Dist: numpy; extra == "reddit"
262
- Provides-Extra: redis
263
- Requires-Dist: redis; extra == "redis"
264
- Requires-Dist: pandas; extra == "redis"
265
- Requires-Dist: numpy; extra == "redis"
266
- Provides-Extra: s3
267
- Requires-Dist: fsspec; extra == "s3"
268
- Requires-Dist: s3fs; extra == "s3"
269
- Requires-Dist: pandas; extra == "s3"
270
- Requires-Dist: numpy; extra == "s3"
271
- Provides-Extra: sharepoint
272
- Requires-Dist: msal; extra == "sharepoint"
273
- Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
274
- Requires-Dist: requests; extra == "sharepoint"
275
- Requires-Dist: pandas; extra == "sharepoint"
276
- Requires-Dist: numpy; extra == "sharepoint"
277
- Provides-Extra: salesforce
278
- Requires-Dist: simple-salesforce; extra == "salesforce"
279
- Requires-Dist: pandas; extra == "salesforce"
280
- Requires-Dist: numpy; extra == "salesforce"
281
- Provides-Extra: sftp
282
- Requires-Dist: paramiko; extra == "sftp"
283
- Requires-Dist: fsspec; extra == "sftp"
284
- Requires-Dist: pandas; extra == "sftp"
285
- Requires-Dist: numpy; extra == "sftp"
286
- Provides-Extra: slack
287
- Requires-Dist: slack_sdk[optional]; extra == "slack"
288
- Requires-Dist: pandas; extra == "slack"
289
- Requires-Dist: numpy; extra == "slack"
290
- Provides-Extra: snowflake
291
- Requires-Dist: snowflake-connector-python; extra == "snowflake"
292
- Requires-Dist: psycopg2-binary; extra == "snowflake"
293
- Requires-Dist: pandas; extra == "snowflake"
294
- Requires-Dist: numpy; extra == "snowflake"
295
- Provides-Extra: wikipedia
296
- Requires-Dist: wikipedia; extra == "wikipedia"
297
- Requires-Dist: pandas; extra == "wikipedia"
298
- Requires-Dist: numpy; extra == "wikipedia"
299
- Provides-Extra: weaviate
300
- Requires-Dist: weaviate-client; extra == "weaviate"
301
- Requires-Dist: pandas; extra == "weaviate"
302
- Requires-Dist: numpy; extra == "weaviate"
303
- Provides-Extra: databricks-volumes
304
- Requires-Dist: databricks-sdk; extra == "databricks-volumes"
305
- Requires-Dist: pandas; extra == "databricks-volumes"
306
- Requires-Dist: numpy; extra == "databricks-volumes"
307
- Provides-Extra: databricks-delta-tables
308
- Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
309
- Requires-Dist: pandas; extra == "databricks-delta-tables"
310
- Requires-Dist: numpy; extra == "databricks-delta-tables"
311
- Provides-Extra: singlestore
312
- Requires-Dist: singlestoredb; extra == "singlestore"
313
- Requires-Dist: pandas; extra == "singlestore"
314
- Requires-Dist: numpy; extra == "singlestore"
315
- Provides-Extra: vectara
316
- Requires-Dist: requests; extra == "vectara"
317
- Requires-Dist: httpx; extra == "vectara"
318
- Requires-Dist: aiofiles; extra == "vectara"
319
- Requires-Dist: pandas; extra == "vectara"
320
- Requires-Dist: numpy; extra == "vectara"
321
- Provides-Extra: vastdb
322
- Requires-Dist: ibis; extra == "vastdb"
323
- Requires-Dist: vastdb; extra == "vastdb"
324
- Requires-Dist: pyarrow; extra == "vastdb"
325
- Requires-Dist: pandas; extra == "vastdb"
326
- Requires-Dist: numpy; extra == "vastdb"
327
- Provides-Extra: zendesk
328
- Requires-Dist: bs4; extra == "zendesk"
329
- Requires-Dist: httpx; extra == "zendesk"
330
- Requires-Dist: aiofiles; extra == "zendesk"
331
- Requires-Dist: pandas; extra == "zendesk"
332
- Requires-Dist: numpy; extra == "zendesk"
333
- Provides-Extra: embed-huggingface
334
- Requires-Dist: sentence-transformers; extra == "embed-huggingface"
335
- Requires-Dist: pandas; extra == "embed-huggingface"
336
- Requires-Dist: numpy; extra == "embed-huggingface"
337
- Provides-Extra: embed-octoai
338
- Requires-Dist: tiktoken; extra == "embed-octoai"
339
- Requires-Dist: openai; extra == "embed-octoai"
340
- Requires-Dist: pandas; extra == "embed-octoai"
341
- Requires-Dist: numpy; extra == "embed-octoai"
342
- Provides-Extra: embed-vertexai
343
- Requires-Dist: vertexai; extra == "embed-vertexai"
344
- Requires-Dist: pandas; extra == "embed-vertexai"
345
- Requires-Dist: numpy; extra == "embed-vertexai"
346
- Provides-Extra: embed-voyageai
347
- Requires-Dist: voyageai; extra == "embed-voyageai"
348
- Requires-Dist: pandas; extra == "embed-voyageai"
349
- Requires-Dist: numpy; extra == "embed-voyageai"
350
- Provides-Extra: embed-mixedbreadai
351
- Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
352
- Requires-Dist: pandas; extra == "embed-mixedbreadai"
353
- Requires-Dist: numpy; extra == "embed-mixedbreadai"
354
- Provides-Extra: openai
355
- Requires-Dist: tiktoken; extra == "openai"
356
- Requires-Dist: openai; extra == "openai"
357
- Requires-Dist: pandas; extra == "openai"
358
- Requires-Dist: numpy; extra == "openai"
359
- Provides-Extra: bedrock
360
- Requires-Dist: boto3; extra == "bedrock"
361
- Requires-Dist: aioboto3; extra == "bedrock"
362
- Requires-Dist: pandas; extra == "bedrock"
363
- Requires-Dist: numpy; extra == "bedrock"
364
- Provides-Extra: togetherai
365
- Requires-Dist: together; extra == "togetherai"
366
- Requires-Dist: pandas; extra == "togetherai"
367
- Requires-Dist: numpy; extra == "togetherai"
368
- Dynamic: author
369
- Dynamic: author-email
370
- Dynamic: classifier
371
- Dynamic: description
372
- Dynamic: description-content-type
373
- Dynamic: home-page
374
- Dynamic: keywords
375
- Dynamic: license
376
- Dynamic: provides-extra
377
- Dynamic: requires-dist
378
- Dynamic: requires-python
379
- Dynamic: summary
380
-
381
- # Unstructured Ingest
382
-
383
- For details, see the [Unstructured Ingest overview](https://docs.unstructured.io/ingestion/overview) in the Unstructured documentation.
@@ -1,3 +0,0 @@
1
- examples
2
- test
3
- unstructured_ingest