unstructured-ingest 0.0.23__py3-none-any.whl → 0.0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/utils/model_conversion.py +3 -3
- unstructured_ingest/v2/processes/connectors/pinecone.py +32 -21
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.24.dist-info}/METADATA +15 -15
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.24.dist-info}/RECORD +9 -9
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.24.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.24.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.24.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.24.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.23" # pragma: no cover
|
|
1
|
+
__version__ = "0.0.24" # pragma: no cover
|
|
@@ -155,14 +155,14 @@ def _get_type_from_field(field: FieldInfo) -> click.ParamType:
|
|
|
155
155
|
|
|
156
156
|
def get_option_from_field(option_name: str, field_info: FieldInfo) -> Option:
|
|
157
157
|
param_decls = [option_name]
|
|
158
|
-
|
|
158
|
+
help_text = field_info.description or ""
|
|
159
159
|
if examples := field_info.examples:
|
|
160
|
-
|
|
160
|
+
help_text += f" [Examples: {', '.join(examples)}]"
|
|
161
161
|
option_kwargs = {
|
|
162
162
|
"type": _get_type_from_field(field_info),
|
|
163
163
|
"default": get_default_value_from_field(field_info),
|
|
164
164
|
"required": field_info.is_required(),
|
|
165
|
-
"help":
|
|
165
|
+
"help": str(help_text),
|
|
166
166
|
"is_flag": is_boolean_flag(field_info),
|
|
167
167
|
"show_default": field_info.default is not PydanticUndefined,
|
|
168
168
|
}
|
|
@@ -58,20 +58,6 @@ class PineconeConnectionConfig(ConnectionConfig):
|
|
|
58
58
|
return index
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
class PineconeUploadStagerConfig(UploadStagerConfig):
|
|
62
|
-
pass
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
class PineconeUploaderConfig(UploaderConfig):
|
|
66
|
-
batch_size: Optional[int] = Field(
|
|
67
|
-
default=None,
|
|
68
|
-
description="Optional number of records per batch. Will otherwise limit by size.",
|
|
69
|
-
)
|
|
70
|
-
pool_threads: Optional[int] = Field(
|
|
71
|
-
default=1, description="Optional limit on number of threads to use for upload"
|
|
72
|
-
)
|
|
73
|
-
|
|
74
|
-
|
|
75
61
|
ALLOWED_FIELDS = (
|
|
76
62
|
"element_id",
|
|
77
63
|
"text",
|
|
@@ -86,31 +72,56 @@ ALLOWED_FIELDS = (
|
|
|
86
72
|
"is_continuation",
|
|
87
73
|
"link_urls",
|
|
88
74
|
"link_texts",
|
|
75
|
+
"text_as_html",
|
|
89
76
|
)
|
|
90
77
|
|
|
91
78
|
|
|
79
|
+
class PineconeUploadStagerConfig(UploadStagerConfig):
|
|
80
|
+
metadata_fields: list[str] = Field(
|
|
81
|
+
default=str(ALLOWED_FIELDS),
|
|
82
|
+
description=(
|
|
83
|
+
"which metadata from the source element to map to the payload metadata being sent to "
|
|
84
|
+
"Pinecone."
|
|
85
|
+
),
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class PineconeUploaderConfig(UploaderConfig):
|
|
90
|
+
batch_size: Optional[int] = Field(
|
|
91
|
+
default=None,
|
|
92
|
+
description="Optional number of records per batch. Will otherwise limit by size.",
|
|
93
|
+
)
|
|
94
|
+
pool_threads: Optional[int] = Field(
|
|
95
|
+
default=1, description="Optional limit on number of threads to use for upload"
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
|
|
92
99
|
@dataclass
|
|
93
100
|
class PineconeUploadStager(UploadStager):
|
|
94
101
|
upload_stager_config: PineconeUploadStagerConfig = field(
|
|
95
102
|
default_factory=lambda: PineconeUploadStagerConfig()
|
|
96
103
|
)
|
|
97
104
|
|
|
98
|
-
|
|
99
|
-
def conform_dict(element_dict: dict) -> dict:
|
|
105
|
+
def conform_dict(self, element_dict: dict) -> dict:
|
|
100
106
|
embeddings = element_dict.pop("embeddings", None)
|
|
101
107
|
metadata: dict[str, Any] = element_dict.pop("metadata", {})
|
|
102
108
|
data_source = metadata.pop("data_source", {})
|
|
103
109
|
coordinates = metadata.pop("coordinates", {})
|
|
104
|
-
|
|
105
|
-
element_dict
|
|
106
|
-
|
|
107
|
-
|
|
110
|
+
pinecone_metadata = {}
|
|
111
|
+
for possible_meta in [element_dict, metadata, data_source, coordinates]:
|
|
112
|
+
pinecone_metadata.update(
|
|
113
|
+
{
|
|
114
|
+
k: v
|
|
115
|
+
for k, v in possible_meta.items()
|
|
116
|
+
if k in self.upload_stager_config.metadata_fields
|
|
117
|
+
}
|
|
118
|
+
)
|
|
108
119
|
|
|
109
120
|
return {
|
|
110
121
|
"id": str(uuid.uuid4()),
|
|
111
122
|
"values": embeddings,
|
|
112
123
|
"metadata": flatten_dict(
|
|
113
|
-
|
|
124
|
+
pinecone_metadata,
|
|
114
125
|
separator="-",
|
|
115
126
|
flatten_lists=True,
|
|
116
127
|
remove_none=True,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.0.23
|
|
3
|
+
Version: 0.0.24
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -23,19 +23,19 @@ Requires-Python: >=3.9.0,<3.13
|
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
25
|
Requires-Dist: pydantic>=2.7
|
|
26
|
-
Requires-Dist: python-dateutil
|
|
26
|
+
Requires-Dist: tqdm
|
|
27
27
|
Requires-Dist: click
|
|
28
|
+
Requires-Dist: python-dateutil
|
|
28
29
|
Requires-Dist: opentelemetry-sdk
|
|
29
30
|
Requires-Dist: pandas
|
|
30
31
|
Requires-Dist: dataclasses-json
|
|
31
|
-
Requires-Dist: tqdm
|
|
32
32
|
Provides-Extra: airtable
|
|
33
33
|
Requires-Dist: pyairtable; extra == "airtable"
|
|
34
34
|
Provides-Extra: astradb
|
|
35
35
|
Requires-Dist: astrapy; extra == "astradb"
|
|
36
36
|
Provides-Extra: azure
|
|
37
|
-
Requires-Dist: fsspec; extra == "azure"
|
|
38
37
|
Requires-Dist: adlfs; extra == "azure"
|
|
38
|
+
Requires-Dist: fsspec; extra == "azure"
|
|
39
39
|
Provides-Extra: azure-cognitive-search
|
|
40
40
|
Requires-Dist: azure-search-documents; extra == "azure-cognitive-search"
|
|
41
41
|
Provides-Extra: bedrock
|
|
@@ -44,8 +44,8 @@ Provides-Extra: biomed
|
|
|
44
44
|
Requires-Dist: bs4; extra == "biomed"
|
|
45
45
|
Requires-Dist: requests; extra == "biomed"
|
|
46
46
|
Provides-Extra: box
|
|
47
|
-
Requires-Dist: fsspec; extra == "box"
|
|
48
47
|
Requires-Dist: boxfs; extra == "box"
|
|
48
|
+
Requires-Dist: fsspec; extra == "box"
|
|
49
49
|
Provides-Extra: chroma
|
|
50
50
|
Requires-Dist: chromadb; extra == "chroma"
|
|
51
51
|
Provides-Extra: clarifai
|
|
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
|
|
|
60
60
|
Provides-Extra: databricks-volumes
|
|
61
61
|
Requires-Dist: databricks-sdk; extra == "databricks-volumes"
|
|
62
62
|
Provides-Extra: delta-table
|
|
63
|
-
Requires-Dist: fsspec; extra == "delta-table"
|
|
64
63
|
Requires-Dist: deltalake; extra == "delta-table"
|
|
64
|
+
Requires-Dist: fsspec; extra == "delta-table"
|
|
65
65
|
Provides-Extra: discord
|
|
66
66
|
Requires-Dist: discord-py; extra == "discord"
|
|
67
67
|
Provides-Extra: doc
|
|
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
|
|
|
69
69
|
Provides-Extra: docx
|
|
70
70
|
Requires-Dist: unstructured[docx]; extra == "docx"
|
|
71
71
|
Provides-Extra: dropbox
|
|
72
|
-
Requires-Dist: fsspec; extra == "dropbox"
|
|
73
72
|
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
73
|
+
Requires-Dist: fsspec; extra == "dropbox"
|
|
74
74
|
Provides-Extra: elasticsearch
|
|
75
75
|
Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
|
|
76
76
|
Provides-Extra: embed-huggingface
|
|
@@ -87,19 +87,19 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
|
87
87
|
Provides-Extra: epub
|
|
88
88
|
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
89
89
|
Provides-Extra: gcs
|
|
90
|
-
Requires-Dist: bs4; extra == "gcs"
|
|
91
|
-
Requires-Dist: fsspec; extra == "gcs"
|
|
92
90
|
Requires-Dist: gcsfs; extra == "gcs"
|
|
91
|
+
Requires-Dist: fsspec; extra == "gcs"
|
|
92
|
+
Requires-Dist: bs4; extra == "gcs"
|
|
93
93
|
Provides-Extra: github
|
|
94
|
-
Requires-Dist: requests; extra == "github"
|
|
95
94
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
95
|
+
Requires-Dist: requests; extra == "github"
|
|
96
96
|
Provides-Extra: gitlab
|
|
97
97
|
Requires-Dist: python-gitlab; extra == "gitlab"
|
|
98
98
|
Provides-Extra: google-drive
|
|
99
99
|
Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
100
100
|
Provides-Extra: hubspot
|
|
101
|
-
Requires-Dist: urllib3; extra == "hubspot"
|
|
102
101
|
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
102
|
+
Requires-Dist: urllib3; extra == "hubspot"
|
|
103
103
|
Provides-Extra: jira
|
|
104
104
|
Requires-Dist: atlassian-python-api; extra == "jira"
|
|
105
105
|
Provides-Extra: kafka
|
|
@@ -115,16 +115,16 @@ Requires-Dist: pymongo; extra == "mongodb"
|
|
|
115
115
|
Provides-Extra: msg
|
|
116
116
|
Requires-Dist: unstructured[msg]; extra == "msg"
|
|
117
117
|
Provides-Extra: notion
|
|
118
|
+
Requires-Dist: notion-client; extra == "notion"
|
|
118
119
|
Requires-Dist: backoff; extra == "notion"
|
|
119
|
-
Requires-Dist: httpx; extra == "notion"
|
|
120
120
|
Requires-Dist: htmlBuilder; extra == "notion"
|
|
121
|
-
Requires-Dist: notion-client; extra == "notion"
|
|
121
|
+
Requires-Dist: httpx; extra == "notion"
|
|
122
122
|
Provides-Extra: odt
|
|
123
123
|
Requires-Dist: unstructured[odt]; extra == "odt"
|
|
124
124
|
Provides-Extra: onedrive
|
|
125
|
-
Requires-Dist: bs4; extra == "onedrive"
|
|
126
125
|
Requires-Dist: msal; extra == "onedrive"
|
|
127
126
|
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
127
|
+
Requires-Dist: bs4; extra == "onedrive"
|
|
128
128
|
Provides-Extra: openai
|
|
129
129
|
Requires-Dist: openai; extra == "openai"
|
|
130
130
|
Requires-Dist: tiktoken; extra == "openai"
|
|
@@ -156,8 +156,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
|
|
|
156
156
|
Provides-Extra: rtf
|
|
157
157
|
Requires-Dist: unstructured[rtf]; extra == "rtf"
|
|
158
158
|
Provides-Extra: s3
|
|
159
|
-
Requires-Dist: fsspec; extra == "s3"
|
|
160
159
|
Requires-Dist: s3fs; extra == "s3"
|
|
160
|
+
Requires-Dist: fsspec; extra == "s3"
|
|
161
161
|
Provides-Extra: salesforce
|
|
162
162
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
163
163
|
Provides-Extra: sftp
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
2
|
-
unstructured_ingest/__version__.py,sha256=
|
|
2
|
+
unstructured_ingest/__version__.py,sha256=i77-gjXpw3EQpetJm6qwuhTR53KoBsdCYSBjHDaGJUQ,43
|
|
3
3
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
4
4
|
unstructured_ingest/interfaces.py,sha256=0r0gQoHJQ4DVSQEVbUPBA3N6WyvGMkR1u6U2SwUvoAQ,31361
|
|
5
5
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
@@ -277,7 +277,7 @@ unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8n
|
|
|
277
277
|
unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdjtlPhqn5Mg,2872
|
|
278
278
|
unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
279
279
|
unstructured_ingest/v2/cli/utils/click.py,sha256=Wn2s3PuvBCKB0lsK-W7X_Y0eYyWnS6Y9wWo1OhVBOzY,6344
|
|
280
|
-
unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=
|
|
280
|
+
unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=uJQKpbTC5ysOdVaRq2SWEjG8btBimVZYzX9NVL7xnzs,7500
|
|
281
281
|
unstructured_ingest/v2/interfaces/__init__.py,sha256=Rfa8crx6De7WNOK-EjsWWwFVpsUfCc6gY8B8tQ3ae9I,899
|
|
282
282
|
unstructured_ingest/v2/interfaces/connector.py,sha256=KG0pHdAcpuO5h72xrAkJzADmjxbav31TZ2Wo3PBvwT0,765
|
|
283
283
|
unstructured_ingest/v2/interfaces/downloader.py,sha256=PKT1kr79Mz1urW_8xCyq9sBuK93gDvyTXg5e4ma4htU,2871
|
|
@@ -323,7 +323,7 @@ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYH
|
|
|
323
323
|
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=2_R_hrEAaTU4vJTCK9oKblWTgv6BKjyUhFtC7uq3q2w,4859
|
|
324
324
|
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=ZiUo-dFo1LMOvFwphSLRZiR1PcrN8GWLTHhsh4TU6n0,9207
|
|
325
325
|
unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=dfDSNrWIEk19wuHdlMJpp_SLMOteNPlkDBPlAwu1LVY,6767
|
|
326
|
-
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=
|
|
326
|
+
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=8St-JaVrDdQEVZpRS_TfjFusfjg0bAg3IYyykGFyWdw,7169
|
|
327
327
|
unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
|
|
328
328
|
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=hOaV5gBcHFc6N5Rbu3MgM-5Aol1ht-QkNIN4PqjvfxE,19665
|
|
329
329
|
unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=4rVvWKK2iQr03Ff6cB5zjfE1MpN0JyIGpCxxFCDI6hc,5563
|
|
@@ -339,9 +339,9 @@ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyN
|
|
|
339
339
|
unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
|
|
340
340
|
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
|
|
341
341
|
unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
|
|
342
|
-
unstructured_ingest-0.0.
|
|
343
|
-
unstructured_ingest-0.0.
|
|
344
|
-
unstructured_ingest-0.0.
|
|
345
|
-
unstructured_ingest-0.0.
|
|
346
|
-
unstructured_ingest-0.0.
|
|
347
|
-
unstructured_ingest-0.0.
|
|
342
|
+
unstructured_ingest-0.0.24.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
343
|
+
unstructured_ingest-0.0.24.dist-info/METADATA,sha256=rHTF8fy1vNg5NmCBNVdobYWeGgpn_PBKao2z54UbgnE,7108
|
|
344
|
+
unstructured_ingest-0.0.24.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
345
|
+
unstructured_ingest-0.0.24.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
346
|
+
unstructured_ingest-0.0.24.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
|
|
347
|
+
unstructured_ingest-0.0.24.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.24.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|