unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +57 -13
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +8 -5
- unstructured_ingest/v2/interfaces/file_data.py +8 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/pipeline.py +9 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
- unstructured_ingest/v2/processes/connectors/local.py +27 -16
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
- unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +11 -5
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.pinecone import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class PineconeCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--api-key"],
|
|
17
|
-
required=True,
|
|
18
|
-
type=str,
|
|
19
|
-
help="API key for Pinecone.",
|
|
20
|
-
),
|
|
21
|
-
click.Option(
|
|
22
|
-
["--index-name"],
|
|
23
|
-
required=True,
|
|
24
|
-
type=str,
|
|
25
|
-
help="Name of the index to connect to. Example: my-index",
|
|
26
|
-
),
|
|
27
|
-
click.Option(
|
|
28
|
-
["--environment"],
|
|
29
|
-
required=True,
|
|
30
|
-
type=str,
|
|
31
|
-
help="Environment to connect to. Example: us-east-1",
|
|
32
|
-
),
|
|
33
|
-
]
|
|
34
|
-
return options
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
@dataclass
|
|
38
|
-
class PineconeCliUploaderConfig(CliConfig):
|
|
39
|
-
@staticmethod
|
|
40
|
-
def get_cli_options() -> list[click.Option]:
|
|
41
|
-
options = [
|
|
42
|
-
click.Option(
|
|
43
|
-
["--batch-size"],
|
|
44
|
-
default=100,
|
|
45
|
-
type=int,
|
|
46
|
-
help="Number of records per batch",
|
|
47
|
-
),
|
|
48
|
-
click.Option(
|
|
49
|
-
["--num-processes"],
|
|
50
|
-
default=4,
|
|
51
|
-
type=int,
|
|
52
|
-
help="Number of processes to use for uploading",
|
|
53
|
-
),
|
|
54
|
-
]
|
|
55
|
-
return options
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
pinecone_dest_cmd = DestCmd(
|
|
59
|
-
cmd_name=CONNECTOR_TYPE,
|
|
60
|
-
connection_config=PineconeCliConnectionConfig,
|
|
61
|
-
uploader_config=PineconeCliUploaderConfig,
|
|
62
|
-
)
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.cli.utils import DelimitedString
|
|
8
|
-
from unstructured_ingest.v2.processes.connectors.salesforce import (
|
|
9
|
-
ACCEPTED_CATEGORIES,
|
|
10
|
-
CONNECTOR_TYPE,
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@dataclass
|
|
15
|
-
class SalesforceCliConnectionConfig(CliConfig):
|
|
16
|
-
@staticmethod
|
|
17
|
-
def get_cli_options() -> list[click.Option]:
|
|
18
|
-
options = [
|
|
19
|
-
click.Option(
|
|
20
|
-
["--username"],
|
|
21
|
-
required=True,
|
|
22
|
-
type=str,
|
|
23
|
-
help="Salesforce username usually looks like an email.",
|
|
24
|
-
),
|
|
25
|
-
click.Option(
|
|
26
|
-
["--consumer-key"],
|
|
27
|
-
required=True,
|
|
28
|
-
type=str,
|
|
29
|
-
help="For the Salesforce JWT auth. Found in Consumer Details.",
|
|
30
|
-
),
|
|
31
|
-
click.Option(
|
|
32
|
-
["--private-key"],
|
|
33
|
-
required=True,
|
|
34
|
-
type=str,
|
|
35
|
-
help="Path to the private key or its contents for the Salesforce JWT auth. "
|
|
36
|
-
"Key file is usually named server.key.",
|
|
37
|
-
),
|
|
38
|
-
]
|
|
39
|
-
return options
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
@dataclass
|
|
43
|
-
class SalesforceCliIndexerConfig(CliConfig):
|
|
44
|
-
@staticmethod
|
|
45
|
-
def get_cli_options() -> list[click.Option]:
|
|
46
|
-
possible_categories = ACCEPTED_CATEGORIES
|
|
47
|
-
options = [
|
|
48
|
-
click.Option(
|
|
49
|
-
["--categories"],
|
|
50
|
-
default=None,
|
|
51
|
-
required=True,
|
|
52
|
-
type=DelimitedString(choices=possible_categories),
|
|
53
|
-
help="Comma-delimited salesforce categories to download. "
|
|
54
|
-
"Currently only {}.".format(", ".join(possible_categories)),
|
|
55
|
-
),
|
|
56
|
-
]
|
|
57
|
-
return options
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
@dataclass
|
|
61
|
-
class SalesforceCliDownloadConfig(CliConfig):
|
|
62
|
-
@staticmethod
|
|
63
|
-
def get_cli_options() -> list[click.Option]:
|
|
64
|
-
options = [
|
|
65
|
-
click.Option(
|
|
66
|
-
["--download-dir"],
|
|
67
|
-
help="Where files are downloaded to, defaults to a location at"
|
|
68
|
-
"`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
|
|
69
|
-
),
|
|
70
|
-
]
|
|
71
|
-
return options
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
salesforce_src_cmd = SrcCmd(
|
|
75
|
-
cmd_name=CONNECTOR_TYPE,
|
|
76
|
-
connection_config=SalesforceCliConnectionConfig,
|
|
77
|
-
indexer_config=SalesforceCliIndexerConfig,
|
|
78
|
-
downloader_config=SalesforceCliDownloadConfig,
|
|
79
|
-
)
|
|
@@ -1,112 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.sharepoint import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class SharepointCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--client-id"],
|
|
17
|
-
default=None,
|
|
18
|
-
type=str,
|
|
19
|
-
help="Sharepoint app client ID",
|
|
20
|
-
),
|
|
21
|
-
click.Option(
|
|
22
|
-
["--client-cred"],
|
|
23
|
-
default=None,
|
|
24
|
-
type=str,
|
|
25
|
-
help="Sharepoint app secret",
|
|
26
|
-
),
|
|
27
|
-
click.Option(
|
|
28
|
-
["--site"],
|
|
29
|
-
default=None,
|
|
30
|
-
type=str,
|
|
31
|
-
help="Sharepoint site url. Process either base url e.g \
|
|
32
|
-
https://[tenant].sharepoint.com or relative sites \
|
|
33
|
-
https://[tenant].sharepoint.com/sites/<site_name>. \
|
|
34
|
-
To process all sites within the tenant pass a site url as \
|
|
35
|
-
https://[tenant]-admin.sharepoint.com.\
|
|
36
|
-
This requires the app to be registered at a tenant level",
|
|
37
|
-
),
|
|
38
|
-
click.Option(
|
|
39
|
-
["--permissions-application-id"],
|
|
40
|
-
type=str,
|
|
41
|
-
help="Microsoft Graph API application id",
|
|
42
|
-
),
|
|
43
|
-
click.Option(
|
|
44
|
-
["--permissions-client-cred"],
|
|
45
|
-
type=str,
|
|
46
|
-
help="Microsoft Graph API application credentials",
|
|
47
|
-
),
|
|
48
|
-
click.Option(
|
|
49
|
-
["--permissions-tenant"],
|
|
50
|
-
type=str,
|
|
51
|
-
help="e.g https://contoso.onmicrosoft.com to get permissions data within tenant.",
|
|
52
|
-
),
|
|
53
|
-
]
|
|
54
|
-
return options
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
@dataclass
|
|
58
|
-
class SharepointCliIndexerConfig(CliConfig):
|
|
59
|
-
@staticmethod
|
|
60
|
-
def get_cli_options() -> list[click.Option]:
|
|
61
|
-
options = [
|
|
62
|
-
click.Option(
|
|
63
|
-
["--path"],
|
|
64
|
-
default=None,
|
|
65
|
-
type=str,
|
|
66
|
-
help="Path from which to start parsing files. If the connector is to \
|
|
67
|
-
process all sites within the tenant this filter will be applied to \
|
|
68
|
-
all sites document libraries.",
|
|
69
|
-
),
|
|
70
|
-
click.Option(
|
|
71
|
-
["--recursive"],
|
|
72
|
-
is_flag=True,
|
|
73
|
-
default=False,
|
|
74
|
-
help="Recursively download files in their respective folders "
|
|
75
|
-
"otherwise stop at the files in provided folder level.",
|
|
76
|
-
),
|
|
77
|
-
click.Option(
|
|
78
|
-
["--omit-files"],
|
|
79
|
-
is_flag=True,
|
|
80
|
-
default=False,
|
|
81
|
-
help="Don't process files.",
|
|
82
|
-
),
|
|
83
|
-
click.Option(
|
|
84
|
-
["--omit-pages"],
|
|
85
|
-
is_flag=True,
|
|
86
|
-
default=False,
|
|
87
|
-
help="Don't process site pages.",
|
|
88
|
-
),
|
|
89
|
-
]
|
|
90
|
-
return options
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
@dataclass
|
|
94
|
-
class SharepointCliDownloadConfig(CliConfig):
|
|
95
|
-
@staticmethod
|
|
96
|
-
def get_cli_options() -> list[click.Option]:
|
|
97
|
-
options = [
|
|
98
|
-
click.Option(
|
|
99
|
-
["--download-dir"],
|
|
100
|
-
help="Where files are downloaded to, defaults to a location at"
|
|
101
|
-
"`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
|
|
102
|
-
),
|
|
103
|
-
]
|
|
104
|
-
return options
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
sharepoint_drive_src_cmd = SrcCmd(
|
|
108
|
-
cmd_name=CONNECTOR_TYPE,
|
|
109
|
-
connection_config=SharepointCliConnectionConfig,
|
|
110
|
-
indexer_config=SharepointCliIndexerConfig,
|
|
111
|
-
downloader_config=SharepointCliDownloadConfig,
|
|
112
|
-
)
|
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.singlestore import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class SingleStoreCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--host"],
|
|
17
|
-
required=False,
|
|
18
|
-
type=str,
|
|
19
|
-
default=None,
|
|
20
|
-
help="SingleStore host",
|
|
21
|
-
),
|
|
22
|
-
click.Option(
|
|
23
|
-
["--port"],
|
|
24
|
-
required=False,
|
|
25
|
-
type=int,
|
|
26
|
-
default=None,
|
|
27
|
-
help="SingleStore port",
|
|
28
|
-
),
|
|
29
|
-
click.Option(
|
|
30
|
-
["--user"],
|
|
31
|
-
required=False,
|
|
32
|
-
type=str,
|
|
33
|
-
default=None,
|
|
34
|
-
help="SingleStore user",
|
|
35
|
-
),
|
|
36
|
-
click.Option(
|
|
37
|
-
["--password"],
|
|
38
|
-
required=False,
|
|
39
|
-
type=str,
|
|
40
|
-
default=None,
|
|
41
|
-
help="SingleStore password",
|
|
42
|
-
),
|
|
43
|
-
click.Option(
|
|
44
|
-
["--database"],
|
|
45
|
-
required=False,
|
|
46
|
-
type=str,
|
|
47
|
-
default=None,
|
|
48
|
-
help="SingleStore database",
|
|
49
|
-
),
|
|
50
|
-
]
|
|
51
|
-
return options
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
@dataclass
|
|
55
|
-
class SingleStoreCliUploaderConfig(CliConfig):
|
|
56
|
-
@staticmethod
|
|
57
|
-
def get_cli_options() -> list[click.Option]:
|
|
58
|
-
options = [
|
|
59
|
-
click.Option(
|
|
60
|
-
["--drop-empty-cols"],
|
|
61
|
-
required=False,
|
|
62
|
-
type=bool,
|
|
63
|
-
is_flag=True,
|
|
64
|
-
default=False,
|
|
65
|
-
help="Drop any columns that have no data",
|
|
66
|
-
),
|
|
67
|
-
]
|
|
68
|
-
return options
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
@dataclass
|
|
72
|
-
class SingleStoreCliUploadStagerConfig(CliConfig):
|
|
73
|
-
@staticmethod
|
|
74
|
-
def get_cli_options() -> list[click.Option]:
|
|
75
|
-
return [
|
|
76
|
-
click.Option(
|
|
77
|
-
["--table-name"],
|
|
78
|
-
required=False,
|
|
79
|
-
type=str,
|
|
80
|
-
help="SingleStore table to write contents to",
|
|
81
|
-
),
|
|
82
|
-
click.Option(
|
|
83
|
-
["--batch-size"],
|
|
84
|
-
required=False,
|
|
85
|
-
type=click.IntRange(min=1),
|
|
86
|
-
help="Batch size when writing to SingleStore",
|
|
87
|
-
),
|
|
88
|
-
]
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
singlestore_dest_cmd = DestCmd(
|
|
92
|
-
cmd_name=CONNECTOR_TYPE,
|
|
93
|
-
connection_config=SingleStoreCliConnectionConfig,
|
|
94
|
-
uploader_config=SingleStoreCliUploaderConfig,
|
|
95
|
-
upload_stager_config=SingleStoreCliUploadStagerConfig,
|
|
96
|
-
)
|
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.sql import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
SQL_DRIVERS = {"postgresql", "sqlite"}
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@dataclass
|
|
13
|
-
class SQLCliConnectionConfig(CliConfig):
|
|
14
|
-
@staticmethod
|
|
15
|
-
def get_cli_options() -> list[click.Option]:
|
|
16
|
-
options = [
|
|
17
|
-
click.Option(
|
|
18
|
-
["--db-type"],
|
|
19
|
-
required=True,
|
|
20
|
-
type=click.Choice(SQL_DRIVERS),
|
|
21
|
-
help="Type of the database backend",
|
|
22
|
-
),
|
|
23
|
-
click.Option(
|
|
24
|
-
["--username"],
|
|
25
|
-
default=None,
|
|
26
|
-
type=str,
|
|
27
|
-
help="DB username",
|
|
28
|
-
),
|
|
29
|
-
click.Option(
|
|
30
|
-
["--password"],
|
|
31
|
-
default=None,
|
|
32
|
-
type=str,
|
|
33
|
-
help="DB password",
|
|
34
|
-
),
|
|
35
|
-
click.Option(
|
|
36
|
-
["--host"],
|
|
37
|
-
default=None,
|
|
38
|
-
type=str,
|
|
39
|
-
help="DB host",
|
|
40
|
-
),
|
|
41
|
-
click.Option(
|
|
42
|
-
["--port"],
|
|
43
|
-
default=None,
|
|
44
|
-
type=int,
|
|
45
|
-
help="DB host connection port",
|
|
46
|
-
),
|
|
47
|
-
click.Option(
|
|
48
|
-
["--database"],
|
|
49
|
-
default=None,
|
|
50
|
-
type=str,
|
|
51
|
-
help="Database name. For sqlite databases, this is the path to the .db file.",
|
|
52
|
-
),
|
|
53
|
-
]
|
|
54
|
-
return options
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
@dataclass
|
|
58
|
-
class SQLCliUploaderConfig(CliConfig):
|
|
59
|
-
@staticmethod
|
|
60
|
-
def get_cli_options() -> list[click.Option]:
|
|
61
|
-
options = [
|
|
62
|
-
click.Option(
|
|
63
|
-
["--batch-size"],
|
|
64
|
-
default=100,
|
|
65
|
-
type=int,
|
|
66
|
-
help="Number of records per batch",
|
|
67
|
-
)
|
|
68
|
-
]
|
|
69
|
-
return options
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
@dataclass
|
|
73
|
-
class SQLCliUploadStagerConfig(CliConfig):
|
|
74
|
-
@staticmethod
|
|
75
|
-
def get_cli_options() -> list[click.Option]:
|
|
76
|
-
return []
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
sql_dest_cmd = DestCmd(
|
|
80
|
-
cmd_name=CONNECTOR_TYPE,
|
|
81
|
-
connection_config=SQLCliConnectionConfig,
|
|
82
|
-
uploader_config=SQLCliUploaderConfig,
|
|
83
|
-
upload_stager_config=SQLCliUploadStagerConfig,
|
|
84
|
-
)
|
|
@@ -1,100 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.cli.utils import DelimitedString
|
|
8
|
-
from unstructured_ingest.v2.processes.connectors.weaviate import CONNECTOR_TYPE
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
|
|
12
|
-
class WeaviateCliConnectionConfig(CliConfig):
|
|
13
|
-
@staticmethod
|
|
14
|
-
def get_cli_options() -> list[click.Option]:
|
|
15
|
-
options = [
|
|
16
|
-
click.Option(
|
|
17
|
-
["--host-url"],
|
|
18
|
-
required=True,
|
|
19
|
-
help="Weaviate instance url",
|
|
20
|
-
),
|
|
21
|
-
click.Option(
|
|
22
|
-
["--class-name"],
|
|
23
|
-
default=None,
|
|
24
|
-
type=str,
|
|
25
|
-
help="Name of the class to push the records into, e.g: Pdf-elements",
|
|
26
|
-
),
|
|
27
|
-
click.Option(
|
|
28
|
-
["--access-token"], default=None, type=str, help="Used to create the bearer token."
|
|
29
|
-
),
|
|
30
|
-
click.Option(
|
|
31
|
-
["--refresh-token"],
|
|
32
|
-
default=None,
|
|
33
|
-
type=str,
|
|
34
|
-
help="Will tie this value to the bearer token. If not provided, "
|
|
35
|
-
"the authentication will expire once the lifetime of the access token is up.",
|
|
36
|
-
),
|
|
37
|
-
click.Option(
|
|
38
|
-
["--api-key"],
|
|
39
|
-
default=None,
|
|
40
|
-
type=str,
|
|
41
|
-
),
|
|
42
|
-
click.Option(
|
|
43
|
-
["--client-secret"],
|
|
44
|
-
default=None,
|
|
45
|
-
type=str,
|
|
46
|
-
),
|
|
47
|
-
click.Option(
|
|
48
|
-
["--scope"],
|
|
49
|
-
default=None,
|
|
50
|
-
type=DelimitedString(),
|
|
51
|
-
),
|
|
52
|
-
click.Option(
|
|
53
|
-
["--username"],
|
|
54
|
-
default=None,
|
|
55
|
-
type=str,
|
|
56
|
-
),
|
|
57
|
-
click.Option(
|
|
58
|
-
["--password"],
|
|
59
|
-
default=None,
|
|
60
|
-
type=str,
|
|
61
|
-
),
|
|
62
|
-
click.Option(
|
|
63
|
-
["--anonymous"],
|
|
64
|
-
is_flag=True,
|
|
65
|
-
default=False,
|
|
66
|
-
type=bool,
|
|
67
|
-
help="if set, all auth values will be ignored",
|
|
68
|
-
),
|
|
69
|
-
]
|
|
70
|
-
return options
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
@dataclass
|
|
74
|
-
class WeaviateCliUploaderConfig(CliConfig):
|
|
75
|
-
@staticmethod
|
|
76
|
-
def get_cli_options() -> list[click.Option]:
|
|
77
|
-
options = [
|
|
78
|
-
click.Option(
|
|
79
|
-
["--batch-size"],
|
|
80
|
-
default=100,
|
|
81
|
-
type=int,
|
|
82
|
-
help="Number of records per batch",
|
|
83
|
-
)
|
|
84
|
-
]
|
|
85
|
-
return options
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
@dataclass
|
|
89
|
-
class WeaviateCliUploadStagerConfig(CliConfig):
|
|
90
|
-
@staticmethod
|
|
91
|
-
def get_cli_options() -> list[click.Option]:
|
|
92
|
-
return []
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
weaviate_dest_cmd = DestCmd(
|
|
96
|
-
cmd_name=CONNECTOR_TYPE,
|
|
97
|
-
connection_config=WeaviateCliConnectionConfig,
|
|
98
|
-
uploader_config=WeaviateCliUploaderConfig,
|
|
99
|
-
upload_stager_config=WeaviateCliUploadStagerConfig,
|
|
100
|
-
)
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
from .chunk import ChunkerCliConfig
|
|
2
|
-
from .embed import EmbedderCliConfig
|
|
3
|
-
from .filter import FilterCliConfig
|
|
4
|
-
from .partition import PartitionerCliConfig
|
|
5
|
-
from .processor import ProcessorCliConfig
|
|
6
|
-
|
|
7
|
-
__all__ = [
|
|
8
|
-
"ChunkerCliConfig",
|
|
9
|
-
"ProcessorCliConfig",
|
|
10
|
-
"PartitionerCliConfig",
|
|
11
|
-
"EmbedderCliConfig",
|
|
12
|
-
"FilterCliConfig",
|
|
13
|
-
]
|
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
from unstructured.chunking import CHUNK_MAX_CHARS_DEFAULT, CHUNK_MULTI_PAGE_DEFAULT
|
|
5
|
-
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
@dataclass
|
|
10
|
-
class ChunkerCliConfig(CliConfig):
|
|
11
|
-
@staticmethod
|
|
12
|
-
def get_cli_options() -> list[click.Option]:
|
|
13
|
-
options = [
|
|
14
|
-
click.Option(
|
|
15
|
-
["--chunking-strategy"],
|
|
16
|
-
type=str,
|
|
17
|
-
default=None,
|
|
18
|
-
help="The rule-set to use to form chunks. Omit to disable chunking.",
|
|
19
|
-
),
|
|
20
|
-
click.Option(
|
|
21
|
-
["--chunk-combine-text-under-n-chars"],
|
|
22
|
-
type=int,
|
|
23
|
-
help=(
|
|
24
|
-
"Combine consecutive chunks when the first does not exceed this length and"
|
|
25
|
-
" the second will fit without exceeding the hard-maximum length. Only"
|
|
26
|
-
" operative for 'by_title' chunking-strategy."
|
|
27
|
-
),
|
|
28
|
-
),
|
|
29
|
-
click.Option(
|
|
30
|
-
["--chunk-include-orig-elements/--chunk-no-include-orig-elements"],
|
|
31
|
-
is_flag=True,
|
|
32
|
-
default=True,
|
|
33
|
-
help=(
|
|
34
|
-
"When chunking, add the original elements consolidated to form each chunk to"
|
|
35
|
-
" `.metadata.orig_elements` on that chunk."
|
|
36
|
-
),
|
|
37
|
-
),
|
|
38
|
-
click.Option(
|
|
39
|
-
["--chunk-max-characters"],
|
|
40
|
-
type=int,
|
|
41
|
-
default=CHUNK_MAX_CHARS_DEFAULT,
|
|
42
|
-
show_default=True,
|
|
43
|
-
help=(
|
|
44
|
-
"Hard maximum chunk length. No chunk will exceed this length. An oversized"
|
|
45
|
-
" element will be divided by text-splitting to fit this window."
|
|
46
|
-
),
|
|
47
|
-
),
|
|
48
|
-
click.Option(
|
|
49
|
-
["--chunk-multipage-sections/--chunk-no-multipage-sections"],
|
|
50
|
-
is_flag=True,
|
|
51
|
-
default=CHUNK_MULTI_PAGE_DEFAULT,
|
|
52
|
-
help=(
|
|
53
|
-
"Ignore page boundaries when chunking such that elements from two different"
|
|
54
|
-
" pages can appear in the same chunk. Only operative for 'by_title'"
|
|
55
|
-
" chunking-strategy."
|
|
56
|
-
),
|
|
57
|
-
),
|
|
58
|
-
click.Option(
|
|
59
|
-
["--chunk-new-after-n-chars"],
|
|
60
|
-
type=int,
|
|
61
|
-
help=(
|
|
62
|
-
"Soft-maximum chunk length. Another element will not be added to a chunk of"
|
|
63
|
-
" this length even when it would fit without exceeding the hard-maximum"
|
|
64
|
-
" length."
|
|
65
|
-
),
|
|
66
|
-
),
|
|
67
|
-
click.Option(
|
|
68
|
-
["--chunk-overlap"],
|
|
69
|
-
type=int,
|
|
70
|
-
default=0,
|
|
71
|
-
show_default=True,
|
|
72
|
-
help=(
|
|
73
|
-
"Prefix chunk text with last overlap=N characters of prior chunk. Only"
|
|
74
|
-
" applies to oversized chunks divided by text-splitting. To apply overlap to"
|
|
75
|
-
" non-oversized chunks use the --overlap-all option."
|
|
76
|
-
),
|
|
77
|
-
),
|
|
78
|
-
click.Option(
|
|
79
|
-
["--chunk-overlap-all"],
|
|
80
|
-
is_flag=True,
|
|
81
|
-
default=False,
|
|
82
|
-
help=(
|
|
83
|
-
"Apply overlap to chunks formed from whole elements as well as those formed"
|
|
84
|
-
" by text-splitting oversized elements. Overlap length is take from --overlap"
|
|
85
|
-
" option value."
|
|
86
|
-
),
|
|
87
|
-
),
|
|
88
|
-
]
|
|
89
|
-
return options
|