unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +66 -12
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -21
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +17 -8
- unstructured_ingest/v2/interfaces/file_data.py +13 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +73 -7
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +90 -24
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
- unstructured_ingest/v2/processes/connectors/local.py +36 -28
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
- unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
- unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
- unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
- unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
- unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +52 -39
- unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -60
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -6
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.cmds.elasticsearch import (
|
|
7
|
-
ElasticsearchCliDownloadConfig,
|
|
8
|
-
ElasticsearchCliIndexerConfig,
|
|
9
|
-
ElasticsearchCliUploadStagerConfig,
|
|
10
|
-
ElasticsearchUploaderConfig,
|
|
11
|
-
)
|
|
12
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
13
|
-
from unstructured_ingest.v2.cli.utils import DelimitedString
|
|
14
|
-
from unstructured_ingest.v2.processes.connectors.opensearch import CONNECTOR_TYPE
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@dataclass
|
|
18
|
-
class OpenSearchCliConnectionConfig(CliConfig):
|
|
19
|
-
@staticmethod
|
|
20
|
-
def get_cli_options() -> list[click.Option]:
|
|
21
|
-
options = [
|
|
22
|
-
click.Option(
|
|
23
|
-
["--hosts"],
|
|
24
|
-
type=DelimitedString(),
|
|
25
|
-
help='List of the OpenSearch hosts to connect to, e.g. "http://localhost:9200"',
|
|
26
|
-
),
|
|
27
|
-
click.Option(
|
|
28
|
-
["--username"], type=str, default=None, help="username when using basic auth"
|
|
29
|
-
),
|
|
30
|
-
click.Option(
|
|
31
|
-
["--password"],
|
|
32
|
-
type=str,
|
|
33
|
-
default=None,
|
|
34
|
-
help="password when using basic auth",
|
|
35
|
-
),
|
|
36
|
-
click.Option(
|
|
37
|
-
["--use-ssl"],
|
|
38
|
-
type=bool,
|
|
39
|
-
default=False,
|
|
40
|
-
is_flag=True,
|
|
41
|
-
help="use ssl for the connection",
|
|
42
|
-
),
|
|
43
|
-
click.Option(
|
|
44
|
-
["--verify-certs"],
|
|
45
|
-
type=bool,
|
|
46
|
-
default=False,
|
|
47
|
-
is_flag=True,
|
|
48
|
-
help="whether to verify SSL certificates",
|
|
49
|
-
),
|
|
50
|
-
click.Option(
|
|
51
|
-
["--ssl-show-warn"],
|
|
52
|
-
type=bool,
|
|
53
|
-
default=False,
|
|
54
|
-
is_flag=True,
|
|
55
|
-
help="show warning when verify certs is disabled",
|
|
56
|
-
),
|
|
57
|
-
click.Option(
|
|
58
|
-
["--ca-certs"],
|
|
59
|
-
type=click.Path(),
|
|
60
|
-
default=None,
|
|
61
|
-
help="path to CA bundle",
|
|
62
|
-
),
|
|
63
|
-
click.Option(
|
|
64
|
-
["--client-cert"],
|
|
65
|
-
type=click.Path(),
|
|
66
|
-
default=None,
|
|
67
|
-
help="path to the file containing the private key and the certificate,"
|
|
68
|
-
" or cert only if using client_key",
|
|
69
|
-
),
|
|
70
|
-
click.Option(
|
|
71
|
-
["--client-key"],
|
|
72
|
-
type=click.Path(),
|
|
73
|
-
default=None,
|
|
74
|
-
help="path to the file containing the private key"
|
|
75
|
-
" if using separate cert and key files",
|
|
76
|
-
),
|
|
77
|
-
]
|
|
78
|
-
return options
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
opensearch_src_cmd = SrcCmd(
|
|
82
|
-
cmd_name=CONNECTOR_TYPE,
|
|
83
|
-
connection_config=OpenSearchCliConnectionConfig,
|
|
84
|
-
indexer_config=ElasticsearchCliIndexerConfig,
|
|
85
|
-
downloader_config=ElasticsearchCliDownloadConfig,
|
|
86
|
-
)
|
|
87
|
-
|
|
88
|
-
opensearch_dest_cmd = DestCmd(
|
|
89
|
-
cmd_name=CONNECTOR_TYPE,
|
|
90
|
-
connection_config=OpenSearchCliConnectionConfig,
|
|
91
|
-
upload_stager_config=ElasticsearchCliUploadStagerConfig,
|
|
92
|
-
uploader_config=ElasticsearchUploaderConfig,
|
|
93
|
-
)
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.pinecone import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class PineconeCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--api-key"],
|
|
17
|
-
required=True,
|
|
18
|
-
type=str,
|
|
19
|
-
help="API key for Pinecone.",
|
|
20
|
-
),
|
|
21
|
-
click.Option(
|
|
22
|
-
["--index-name"],
|
|
23
|
-
required=True,
|
|
24
|
-
type=str,
|
|
25
|
-
help="Name of the index to connect to. Example: my-index",
|
|
26
|
-
),
|
|
27
|
-
click.Option(
|
|
28
|
-
["--environment"],
|
|
29
|
-
required=True,
|
|
30
|
-
type=str,
|
|
31
|
-
help="Environment to connect to. Example: us-east-1",
|
|
32
|
-
),
|
|
33
|
-
]
|
|
34
|
-
return options
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
@dataclass
|
|
38
|
-
class PineconeCliUploaderConfig(CliConfig):
|
|
39
|
-
@staticmethod
|
|
40
|
-
def get_cli_options() -> list[click.Option]:
|
|
41
|
-
options = [
|
|
42
|
-
click.Option(
|
|
43
|
-
["--batch-size"],
|
|
44
|
-
default=100,
|
|
45
|
-
type=int,
|
|
46
|
-
help="Number of records per batch",
|
|
47
|
-
),
|
|
48
|
-
click.Option(
|
|
49
|
-
["--num-processes"],
|
|
50
|
-
default=4,
|
|
51
|
-
type=int,
|
|
52
|
-
help="Number of processes to use for uploading",
|
|
53
|
-
),
|
|
54
|
-
]
|
|
55
|
-
return options
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
pinecone_dest_cmd = DestCmd(
|
|
59
|
-
cmd_name=CONNECTOR_TYPE,
|
|
60
|
-
connection_config=PineconeCliConnectionConfig,
|
|
61
|
-
uploader_config=PineconeCliUploaderConfig,
|
|
62
|
-
)
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.cli.utils import DelimitedString
|
|
8
|
-
from unstructured_ingest.v2.processes.connectors.salesforce import (
|
|
9
|
-
ACCEPTED_CATEGORIES,
|
|
10
|
-
CONNECTOR_TYPE,
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@dataclass
|
|
15
|
-
class SalesforceCliConnectionConfig(CliConfig):
|
|
16
|
-
@staticmethod
|
|
17
|
-
def get_cli_options() -> list[click.Option]:
|
|
18
|
-
options = [
|
|
19
|
-
click.Option(
|
|
20
|
-
["--username"],
|
|
21
|
-
required=True,
|
|
22
|
-
type=str,
|
|
23
|
-
help="Salesforce username usually looks like an email.",
|
|
24
|
-
),
|
|
25
|
-
click.Option(
|
|
26
|
-
["--consumer-key"],
|
|
27
|
-
required=True,
|
|
28
|
-
type=str,
|
|
29
|
-
help="For the Salesforce JWT auth. Found in Consumer Details.",
|
|
30
|
-
),
|
|
31
|
-
click.Option(
|
|
32
|
-
["--private-key"],
|
|
33
|
-
required=True,
|
|
34
|
-
type=str,
|
|
35
|
-
help="Path to the private key or its contents for the Salesforce JWT auth. "
|
|
36
|
-
"Key file is usually named server.key.",
|
|
37
|
-
),
|
|
38
|
-
]
|
|
39
|
-
return options
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
@dataclass
|
|
43
|
-
class SalesforceCliIndexerConfig(CliConfig):
|
|
44
|
-
@staticmethod
|
|
45
|
-
def get_cli_options() -> list[click.Option]:
|
|
46
|
-
possible_categories = ACCEPTED_CATEGORIES
|
|
47
|
-
options = [
|
|
48
|
-
click.Option(
|
|
49
|
-
["--categories"],
|
|
50
|
-
default=None,
|
|
51
|
-
required=True,
|
|
52
|
-
type=DelimitedString(choices=possible_categories),
|
|
53
|
-
help="Comma-delimited salesforce categories to download. "
|
|
54
|
-
"Currently only {}.".format(", ".join(possible_categories)),
|
|
55
|
-
),
|
|
56
|
-
]
|
|
57
|
-
return options
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
@dataclass
|
|
61
|
-
class SalesforceCliDownloadConfig(CliConfig):
|
|
62
|
-
@staticmethod
|
|
63
|
-
def get_cli_options() -> list[click.Option]:
|
|
64
|
-
options = [
|
|
65
|
-
click.Option(
|
|
66
|
-
["--download-dir"],
|
|
67
|
-
help="Where files are downloaded to, defaults to a location at"
|
|
68
|
-
"`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
|
|
69
|
-
),
|
|
70
|
-
]
|
|
71
|
-
return options
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
salesforce_src_cmd = SrcCmd(
|
|
75
|
-
cmd_name=CONNECTOR_TYPE,
|
|
76
|
-
connection_config=SalesforceCliConnectionConfig,
|
|
77
|
-
indexer_config=SalesforceCliIndexerConfig,
|
|
78
|
-
downloader_config=SalesforceCliDownloadConfig,
|
|
79
|
-
)
|
|
@@ -1,112 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.sharepoint import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class SharepointCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--client-id"],
|
|
17
|
-
default=None,
|
|
18
|
-
type=str,
|
|
19
|
-
help="Sharepoint app client ID",
|
|
20
|
-
),
|
|
21
|
-
click.Option(
|
|
22
|
-
["--client-cred"],
|
|
23
|
-
default=None,
|
|
24
|
-
type=str,
|
|
25
|
-
help="Sharepoint app secret",
|
|
26
|
-
),
|
|
27
|
-
click.Option(
|
|
28
|
-
["--site"],
|
|
29
|
-
default=None,
|
|
30
|
-
type=str,
|
|
31
|
-
help="Sharepoint site url. Process either base url e.g \
|
|
32
|
-
https://[tenant].sharepoint.com or relative sites \
|
|
33
|
-
https://[tenant].sharepoint.com/sites/<site_name>. \
|
|
34
|
-
To process all sites within the tenant pass a site url as \
|
|
35
|
-
https://[tenant]-admin.sharepoint.com.\
|
|
36
|
-
This requires the app to be registered at a tenant level",
|
|
37
|
-
),
|
|
38
|
-
click.Option(
|
|
39
|
-
["--permissions-application-id"],
|
|
40
|
-
type=str,
|
|
41
|
-
help="Microsoft Graph API application id",
|
|
42
|
-
),
|
|
43
|
-
click.Option(
|
|
44
|
-
["--permissions-client-cred"],
|
|
45
|
-
type=str,
|
|
46
|
-
help="Microsoft Graph API application credentials",
|
|
47
|
-
),
|
|
48
|
-
click.Option(
|
|
49
|
-
["--permissions-tenant"],
|
|
50
|
-
type=str,
|
|
51
|
-
help="e.g https://contoso.onmicrosoft.com to get permissions data within tenant.",
|
|
52
|
-
),
|
|
53
|
-
]
|
|
54
|
-
return options
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
@dataclass
|
|
58
|
-
class SharepointCliIndexerConfig(CliConfig):
|
|
59
|
-
@staticmethod
|
|
60
|
-
def get_cli_options() -> list[click.Option]:
|
|
61
|
-
options = [
|
|
62
|
-
click.Option(
|
|
63
|
-
["--path"],
|
|
64
|
-
default=None,
|
|
65
|
-
type=str,
|
|
66
|
-
help="Path from which to start parsing files. If the connector is to \
|
|
67
|
-
process all sites within the tenant this filter will be applied to \
|
|
68
|
-
all sites document libraries.",
|
|
69
|
-
),
|
|
70
|
-
click.Option(
|
|
71
|
-
["--recursive"],
|
|
72
|
-
is_flag=True,
|
|
73
|
-
default=False,
|
|
74
|
-
help="Recursively download files in their respective folders "
|
|
75
|
-
"otherwise stop at the files in provided folder level.",
|
|
76
|
-
),
|
|
77
|
-
click.Option(
|
|
78
|
-
["--omit-files"],
|
|
79
|
-
is_flag=True,
|
|
80
|
-
default=False,
|
|
81
|
-
help="Don't process files.",
|
|
82
|
-
),
|
|
83
|
-
click.Option(
|
|
84
|
-
["--omit-pages"],
|
|
85
|
-
is_flag=True,
|
|
86
|
-
default=False,
|
|
87
|
-
help="Don't process site pages.",
|
|
88
|
-
),
|
|
89
|
-
]
|
|
90
|
-
return options
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
@dataclass
|
|
94
|
-
class SharepointCliDownloadConfig(CliConfig):
|
|
95
|
-
@staticmethod
|
|
96
|
-
def get_cli_options() -> list[click.Option]:
|
|
97
|
-
options = [
|
|
98
|
-
click.Option(
|
|
99
|
-
["--download-dir"],
|
|
100
|
-
help="Where files are downloaded to, defaults to a location at"
|
|
101
|
-
"`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
|
|
102
|
-
),
|
|
103
|
-
]
|
|
104
|
-
return options
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
sharepoint_drive_src_cmd = SrcCmd(
|
|
108
|
-
cmd_name=CONNECTOR_TYPE,
|
|
109
|
-
connection_config=SharepointCliConnectionConfig,
|
|
110
|
-
indexer_config=SharepointCliIndexerConfig,
|
|
111
|
-
downloader_config=SharepointCliDownloadConfig,
|
|
112
|
-
)
|
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.singlestore import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class SingleStoreCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--host"],
|
|
17
|
-
required=False,
|
|
18
|
-
type=str,
|
|
19
|
-
default=None,
|
|
20
|
-
help="SingleStore host",
|
|
21
|
-
),
|
|
22
|
-
click.Option(
|
|
23
|
-
["--port"],
|
|
24
|
-
required=False,
|
|
25
|
-
type=int,
|
|
26
|
-
default=None,
|
|
27
|
-
help="SingleStore port",
|
|
28
|
-
),
|
|
29
|
-
click.Option(
|
|
30
|
-
["--user"],
|
|
31
|
-
required=False,
|
|
32
|
-
type=str,
|
|
33
|
-
default=None,
|
|
34
|
-
help="SingleStore user",
|
|
35
|
-
),
|
|
36
|
-
click.Option(
|
|
37
|
-
["--password"],
|
|
38
|
-
required=False,
|
|
39
|
-
type=str,
|
|
40
|
-
default=None,
|
|
41
|
-
help="SingleStore password",
|
|
42
|
-
),
|
|
43
|
-
click.Option(
|
|
44
|
-
["--database"],
|
|
45
|
-
required=False,
|
|
46
|
-
type=str,
|
|
47
|
-
default=None,
|
|
48
|
-
help="SingleStore database",
|
|
49
|
-
),
|
|
50
|
-
]
|
|
51
|
-
return options
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
@dataclass
|
|
55
|
-
class SingleStoreCliUploaderConfig(CliConfig):
|
|
56
|
-
@staticmethod
|
|
57
|
-
def get_cli_options() -> list[click.Option]:
|
|
58
|
-
options = [
|
|
59
|
-
click.Option(
|
|
60
|
-
["--drop-empty-cols"],
|
|
61
|
-
required=False,
|
|
62
|
-
type=bool,
|
|
63
|
-
is_flag=True,
|
|
64
|
-
default=False,
|
|
65
|
-
help="Drop any columns that have no data",
|
|
66
|
-
),
|
|
67
|
-
]
|
|
68
|
-
return options
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
@dataclass
|
|
72
|
-
class SingleStoreCliUploadStagerConfig(CliConfig):
|
|
73
|
-
@staticmethod
|
|
74
|
-
def get_cli_options() -> list[click.Option]:
|
|
75
|
-
return [
|
|
76
|
-
click.Option(
|
|
77
|
-
["--table-name"],
|
|
78
|
-
required=False,
|
|
79
|
-
type=str,
|
|
80
|
-
help="SingleStore table to write contents to",
|
|
81
|
-
),
|
|
82
|
-
click.Option(
|
|
83
|
-
["--batch-size"],
|
|
84
|
-
required=False,
|
|
85
|
-
type=click.IntRange(min=1),
|
|
86
|
-
help="Batch size when writing to SingleStore",
|
|
87
|
-
),
|
|
88
|
-
]
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
singlestore_dest_cmd = DestCmd(
|
|
92
|
-
cmd_name=CONNECTOR_TYPE,
|
|
93
|
-
connection_config=SingleStoreCliConnectionConfig,
|
|
94
|
-
uploader_config=SingleStoreCliUploaderConfig,
|
|
95
|
-
upload_stager_config=SingleStoreCliUploadStagerConfig,
|
|
96
|
-
)
|
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.sql import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
SQL_DRIVERS = {"postgresql", "sqlite"}
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@dataclass
|
|
13
|
-
class SQLCliConnectionConfig(CliConfig):
|
|
14
|
-
@staticmethod
|
|
15
|
-
def get_cli_options() -> list[click.Option]:
|
|
16
|
-
options = [
|
|
17
|
-
click.Option(
|
|
18
|
-
["--db-type"],
|
|
19
|
-
required=True,
|
|
20
|
-
type=click.Choice(SQL_DRIVERS),
|
|
21
|
-
help="Type of the database backend",
|
|
22
|
-
),
|
|
23
|
-
click.Option(
|
|
24
|
-
["--username"],
|
|
25
|
-
default=None,
|
|
26
|
-
type=str,
|
|
27
|
-
help="DB username",
|
|
28
|
-
),
|
|
29
|
-
click.Option(
|
|
30
|
-
["--password"],
|
|
31
|
-
default=None,
|
|
32
|
-
type=str,
|
|
33
|
-
help="DB password",
|
|
34
|
-
),
|
|
35
|
-
click.Option(
|
|
36
|
-
["--host"],
|
|
37
|
-
default=None,
|
|
38
|
-
type=str,
|
|
39
|
-
help="DB host",
|
|
40
|
-
),
|
|
41
|
-
click.Option(
|
|
42
|
-
["--port"],
|
|
43
|
-
default=None,
|
|
44
|
-
type=int,
|
|
45
|
-
help="DB host connection port",
|
|
46
|
-
),
|
|
47
|
-
click.Option(
|
|
48
|
-
["--database"],
|
|
49
|
-
default=None,
|
|
50
|
-
type=str,
|
|
51
|
-
help="Database name. For sqlite databases, this is the path to the .db file.",
|
|
52
|
-
),
|
|
53
|
-
]
|
|
54
|
-
return options
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
@dataclass
|
|
58
|
-
class SQLCliUploaderConfig(CliConfig):
|
|
59
|
-
@staticmethod
|
|
60
|
-
def get_cli_options() -> list[click.Option]:
|
|
61
|
-
options = [
|
|
62
|
-
click.Option(
|
|
63
|
-
["--batch-size"],
|
|
64
|
-
default=100,
|
|
65
|
-
type=int,
|
|
66
|
-
help="Number of records per batch",
|
|
67
|
-
)
|
|
68
|
-
]
|
|
69
|
-
return options
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
@dataclass
|
|
73
|
-
class SQLCliUploadStagerConfig(CliConfig):
|
|
74
|
-
@staticmethod
|
|
75
|
-
def get_cli_options() -> list[click.Option]:
|
|
76
|
-
return []
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
sql_dest_cmd = DestCmd(
|
|
80
|
-
cmd_name=CONNECTOR_TYPE,
|
|
81
|
-
connection_config=SQLCliConnectionConfig,
|
|
82
|
-
uploader_config=SQLCliUploaderConfig,
|
|
83
|
-
upload_stager_config=SQLCliUploadStagerConfig,
|
|
84
|
-
)
|
|
@@ -1,100 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.cli.utils import DelimitedString
|
|
8
|
-
from unstructured_ingest.v2.processes.connectors.weaviate import CONNECTOR_TYPE
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
|
|
12
|
-
class WeaviateCliConnectionConfig(CliConfig):
|
|
13
|
-
@staticmethod
|
|
14
|
-
def get_cli_options() -> list[click.Option]:
|
|
15
|
-
options = [
|
|
16
|
-
click.Option(
|
|
17
|
-
["--host-url"],
|
|
18
|
-
required=True,
|
|
19
|
-
help="Weaviate instance url",
|
|
20
|
-
),
|
|
21
|
-
click.Option(
|
|
22
|
-
["--class-name"],
|
|
23
|
-
default=None,
|
|
24
|
-
type=str,
|
|
25
|
-
help="Name of the class to push the records into, e.g: Pdf-elements",
|
|
26
|
-
),
|
|
27
|
-
click.Option(
|
|
28
|
-
["--access-token"], default=None, type=str, help="Used to create the bearer token."
|
|
29
|
-
),
|
|
30
|
-
click.Option(
|
|
31
|
-
["--refresh-token"],
|
|
32
|
-
default=None,
|
|
33
|
-
type=str,
|
|
34
|
-
help="Will tie this value to the bearer token. If not provided, "
|
|
35
|
-
"the authentication will expire once the lifetime of the access token is up.",
|
|
36
|
-
),
|
|
37
|
-
click.Option(
|
|
38
|
-
["--api-key"],
|
|
39
|
-
default=None,
|
|
40
|
-
type=str,
|
|
41
|
-
),
|
|
42
|
-
click.Option(
|
|
43
|
-
["--client-secret"],
|
|
44
|
-
default=None,
|
|
45
|
-
type=str,
|
|
46
|
-
),
|
|
47
|
-
click.Option(
|
|
48
|
-
["--scope"],
|
|
49
|
-
default=None,
|
|
50
|
-
type=DelimitedString(),
|
|
51
|
-
),
|
|
52
|
-
click.Option(
|
|
53
|
-
["--username"],
|
|
54
|
-
default=None,
|
|
55
|
-
type=str,
|
|
56
|
-
),
|
|
57
|
-
click.Option(
|
|
58
|
-
["--password"],
|
|
59
|
-
default=None,
|
|
60
|
-
type=str,
|
|
61
|
-
),
|
|
62
|
-
click.Option(
|
|
63
|
-
["--anonymous"],
|
|
64
|
-
is_flag=True,
|
|
65
|
-
default=False,
|
|
66
|
-
type=bool,
|
|
67
|
-
help="if set, all auth values will be ignored",
|
|
68
|
-
),
|
|
69
|
-
]
|
|
70
|
-
return options
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
@dataclass
|
|
74
|
-
class WeaviateCliUploaderConfig(CliConfig):
|
|
75
|
-
@staticmethod
|
|
76
|
-
def get_cli_options() -> list[click.Option]:
|
|
77
|
-
options = [
|
|
78
|
-
click.Option(
|
|
79
|
-
["--batch-size"],
|
|
80
|
-
default=100,
|
|
81
|
-
type=int,
|
|
82
|
-
help="Number of records per batch",
|
|
83
|
-
)
|
|
84
|
-
]
|
|
85
|
-
return options
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
@dataclass
|
|
89
|
-
class WeaviateCliUploadStagerConfig(CliConfig):
|
|
90
|
-
@staticmethod
|
|
91
|
-
def get_cli_options() -> list[click.Option]:
|
|
92
|
-
return []
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
weaviate_dest_cmd = DestCmd(
|
|
96
|
-
cmd_name=CONNECTOR_TYPE,
|
|
97
|
-
connection_config=WeaviateCliConnectionConfig,
|
|
98
|
-
uploader_config=WeaviateCliUploaderConfig,
|
|
99
|
-
upload_stager_config=WeaviateCliUploadStagerConfig,
|
|
100
|
-
)
|