unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +57 -13
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +8 -5
- unstructured_ingest/v2/interfaces/file_data.py +8 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/pipeline.py +1 -5
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- unstructured_ingest/v2/processes/connectors/local.py +22 -14
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
- unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +11 -5
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
|
|
7
|
-
FsspecCliDownloadConfig,
|
|
8
|
-
FsspecCliIndexerConfig,
|
|
9
|
-
FsspecCliUploaderConfig,
|
|
10
|
-
)
|
|
11
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
12
|
-
from unstructured_ingest.v2.cli.utils import FileOrJson
|
|
13
|
-
from unstructured_ingest.v2.processes.connectors.fsspec.gcs import (
|
|
14
|
-
CONNECTOR_TYPE,
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
@dataclass
|
|
19
|
-
class GcsCliDownloadConfig(FsspecCliDownloadConfig):
|
|
20
|
-
pass
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@dataclass
|
|
24
|
-
class GcsCliIndexerConfig(FsspecCliIndexerConfig):
|
|
25
|
-
pass
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
@dataclass
|
|
29
|
-
class GcsCliConnectionConfig(CliConfig):
|
|
30
|
-
@staticmethod
|
|
31
|
-
def get_cli_options() -> list[click.Option]:
|
|
32
|
-
help_string = """
|
|
33
|
-
Options:
|
|
34
|
-
- ``None``, GCSFS will attempt to guess your credentials in the
|
|
35
|
-
following order: gcloud CLI default, gcsfs cached token, google compute
|
|
36
|
-
metadata service, anonymous.
|
|
37
|
-
- ``'google_default'``, your default gcloud credentials will be used,
|
|
38
|
-
which are typically established by doing ``gcloud login`` in a terminal.
|
|
39
|
-
- ``'cache'``, credentials from previously successful gcsfs
|
|
40
|
-
authentication will be used (use this after "browser" auth succeeded)
|
|
41
|
-
- ``'anon'``, no authentication is performed, and you can only
|
|
42
|
-
access data which is accessible to allUsers (in this case, the project and
|
|
43
|
-
access level parameters are meaningless)
|
|
44
|
-
- ``'browser'``, you get an access code with which you can
|
|
45
|
-
authenticate via a specially provided URL
|
|
46
|
-
- if ``'cloud'``, we assume we are running within google compute
|
|
47
|
-
or google container engine, and query the internal metadata directly for
|
|
48
|
-
a token.
|
|
49
|
-
- you may supply a token generated by the
|
|
50
|
-
[gcloud](https://cloud.google.com/sdk/docs/)
|
|
51
|
-
utility; this is either a python dictionary or the name of a file
|
|
52
|
-
containing the JSON returned by logging in with the gcloud CLI tool.
|
|
53
|
-
"""
|
|
54
|
-
options = [
|
|
55
|
-
click.Option(
|
|
56
|
-
["--service-account-key"],
|
|
57
|
-
default=None,
|
|
58
|
-
type=FileOrJson(allow_raw_str=True),
|
|
59
|
-
help=help_string,
|
|
60
|
-
),
|
|
61
|
-
]
|
|
62
|
-
return options
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
@dataclass
|
|
66
|
-
class GcsUploaderConfig(FsspecCliUploaderConfig):
|
|
67
|
-
pass
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
gcs_src_cmd = SrcCmd(
|
|
71
|
-
cmd_name=CONNECTOR_TYPE,
|
|
72
|
-
indexer_config=GcsCliIndexerConfig,
|
|
73
|
-
connection_config=GcsCliConnectionConfig,
|
|
74
|
-
downloader_config=GcsCliDownloadConfig,
|
|
75
|
-
)
|
|
76
|
-
|
|
77
|
-
gcs_dest_cmd = DestCmd(
|
|
78
|
-
cmd_name=CONNECTOR_TYPE,
|
|
79
|
-
connection_config=GcsCliConnectionConfig,
|
|
80
|
-
uploader_config=GcsUploaderConfig,
|
|
81
|
-
)
|
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
|
|
7
|
-
FsspecCliDownloadConfig,
|
|
8
|
-
FsspecCliIndexerConfig,
|
|
9
|
-
FsspecCliUploaderConfig,
|
|
10
|
-
)
|
|
11
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
12
|
-
from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
|
|
13
|
-
CONNECTOR_TYPE,
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@dataclass
|
|
18
|
-
class S3CliDownloadConfig(FsspecCliDownloadConfig):
|
|
19
|
-
pass
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@dataclass
|
|
23
|
-
class S3CliIndexerConfig(FsspecCliIndexerConfig):
|
|
24
|
-
pass
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
@dataclass
|
|
28
|
-
class S3CliConnectionConfig(CliConfig):
|
|
29
|
-
@staticmethod
|
|
30
|
-
def get_cli_options() -> list[click.Option]:
|
|
31
|
-
options = [
|
|
32
|
-
click.Option(
|
|
33
|
-
["--anonymous"],
|
|
34
|
-
is_flag=True,
|
|
35
|
-
default=False,
|
|
36
|
-
help="Connect to s3 without local AWS credentials.",
|
|
37
|
-
),
|
|
38
|
-
click.Option(
|
|
39
|
-
["--endpoint-url"],
|
|
40
|
-
type=str,
|
|
41
|
-
default=None,
|
|
42
|
-
help="Use this endpoint_url, if specified. Needed for "
|
|
43
|
-
"connecting to non-AWS S3 buckets.",
|
|
44
|
-
),
|
|
45
|
-
click.Option(
|
|
46
|
-
["--key"],
|
|
47
|
-
type=str,
|
|
48
|
-
default=None,
|
|
49
|
-
help="If not anonymous, use this access key ID, if specified. Takes precedence "
|
|
50
|
-
"over `aws_access_key_id` in client_kwargs.",
|
|
51
|
-
),
|
|
52
|
-
click.Option(
|
|
53
|
-
["--secret"],
|
|
54
|
-
type=str,
|
|
55
|
-
default=None,
|
|
56
|
-
help="If not anonymous, use this secret access key, if specified.",
|
|
57
|
-
),
|
|
58
|
-
click.Option(
|
|
59
|
-
["--token"],
|
|
60
|
-
type=str,
|
|
61
|
-
default=None,
|
|
62
|
-
help="If not anonymous, use this security token, if specified.",
|
|
63
|
-
),
|
|
64
|
-
]
|
|
65
|
-
return options
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
@dataclass
|
|
69
|
-
class S3UploaderConfig(FsspecCliUploaderConfig):
|
|
70
|
-
pass
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
s3_src_cmd = SrcCmd(
|
|
74
|
-
cmd_name=CONNECTOR_TYPE,
|
|
75
|
-
indexer_config=S3CliIndexerConfig,
|
|
76
|
-
connection_config=S3CliConnectionConfig,
|
|
77
|
-
downloader_config=S3CliDownloadConfig,
|
|
78
|
-
)
|
|
79
|
-
|
|
80
|
-
s3_dest_cmd = DestCmd(
|
|
81
|
-
cmd_name=CONNECTOR_TYPE,
|
|
82
|
-
connection_config=S3CliConnectionConfig,
|
|
83
|
-
uploader_config=S3UploaderConfig,
|
|
84
|
-
)
|
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
|
|
7
|
-
FsspecCliDownloadConfig,
|
|
8
|
-
FsspecCliIndexerConfig,
|
|
9
|
-
FsspecCliUploaderConfig,
|
|
10
|
-
)
|
|
11
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
12
|
-
from unstructured_ingest.v2.processes.connectors.fsspec.sftp import (
|
|
13
|
-
CONNECTOR_TYPE,
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@dataclass
|
|
18
|
-
class SftpCliDownloadConfig(FsspecCliDownloadConfig):
|
|
19
|
-
pass
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@dataclass
|
|
23
|
-
class SftpCliIndexerConfig(FsspecCliIndexerConfig):
|
|
24
|
-
pass
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
@dataclass
|
|
28
|
-
class SftpCliConnectionConfig(CliConfig):
|
|
29
|
-
@staticmethod
|
|
30
|
-
def get_cli_options() -> list[click.Option]:
|
|
31
|
-
options = [
|
|
32
|
-
click.Option(
|
|
33
|
-
["--username"],
|
|
34
|
-
required=True,
|
|
35
|
-
type=str,
|
|
36
|
-
help="Username for sftp connection",
|
|
37
|
-
),
|
|
38
|
-
click.Option(
|
|
39
|
-
["--password"],
|
|
40
|
-
required=True,
|
|
41
|
-
type=str,
|
|
42
|
-
help="Password for sftp connection",
|
|
43
|
-
),
|
|
44
|
-
click.Option(
|
|
45
|
-
["--look-for-keys"],
|
|
46
|
-
required=False,
|
|
47
|
-
default=False,
|
|
48
|
-
is_flag=True,
|
|
49
|
-
type=bool,
|
|
50
|
-
help="Whether to search for private key files in ~/.ssh/",
|
|
51
|
-
),
|
|
52
|
-
click.Option(
|
|
53
|
-
["--allow-agent"],
|
|
54
|
-
required=False,
|
|
55
|
-
default=False,
|
|
56
|
-
is_flag=True,
|
|
57
|
-
type=bool,
|
|
58
|
-
help="Whether to connect to the SSH agent.",
|
|
59
|
-
),
|
|
60
|
-
]
|
|
61
|
-
return options
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
@dataclass
|
|
65
|
-
class SftpUploaderConfig(FsspecCliUploaderConfig):
|
|
66
|
-
pass
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
sftp_src_cmd = SrcCmd(
|
|
70
|
-
cmd_name=CONNECTOR_TYPE,
|
|
71
|
-
indexer_config=SftpCliIndexerConfig,
|
|
72
|
-
connection_config=SftpCliConnectionConfig,
|
|
73
|
-
downloader_config=SftpCliDownloadConfig,
|
|
74
|
-
)
|
|
75
|
-
|
|
76
|
-
sftp_dest_cmd = DestCmd(
|
|
77
|
-
cmd_name=CONNECTOR_TYPE,
|
|
78
|
-
connection_config=SftpCliConnectionConfig,
|
|
79
|
-
uploader_config=SftpUploaderConfig,
|
|
80
|
-
)
|
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.cli.utils import DelimitedString, FileOrJson
|
|
8
|
-
from unstructured_ingest.v2.processes.connectors.google_drive import CONNECTOR_TYPE
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dataclass
|
|
12
|
-
class GoogleDriveCliConnectionConfig(CliConfig):
|
|
13
|
-
@staticmethod
|
|
14
|
-
def get_cli_options() -> list[click.Option]:
|
|
15
|
-
options = [
|
|
16
|
-
click.Option(
|
|
17
|
-
["--drive-id"],
|
|
18
|
-
required=True,
|
|
19
|
-
type=str,
|
|
20
|
-
help="Google Drive File or Folder ID.",
|
|
21
|
-
),
|
|
22
|
-
click.Option(
|
|
23
|
-
["--service-account-key"],
|
|
24
|
-
required=True,
|
|
25
|
-
type=FileOrJson(),
|
|
26
|
-
help="Either the file path of the credentials file to use or a json string of "
|
|
27
|
-
"those values to use for authentication",
|
|
28
|
-
),
|
|
29
|
-
]
|
|
30
|
-
return options
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
@dataclass
|
|
34
|
-
class GoogleDriveCliIndexerConfig(CliConfig):
|
|
35
|
-
@staticmethod
|
|
36
|
-
def get_cli_options() -> list[click.Option]:
|
|
37
|
-
options = [
|
|
38
|
-
click.Option(
|
|
39
|
-
["--extensions"],
|
|
40
|
-
default=None,
|
|
41
|
-
type=DelimitedString(),
|
|
42
|
-
help="Filters the files to be processed based on extension e.g. jpg, docx, etc.",
|
|
43
|
-
),
|
|
44
|
-
click.Option(
|
|
45
|
-
["--recursive"],
|
|
46
|
-
is_flag=True,
|
|
47
|
-
default=False,
|
|
48
|
-
help="Recursively download files in their respective folders "
|
|
49
|
-
"otherwise stop at the files in provided folder level.",
|
|
50
|
-
),
|
|
51
|
-
]
|
|
52
|
-
return options
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
@dataclass
|
|
56
|
-
class GoogleDriveCliDownloadConfig(CliConfig):
|
|
57
|
-
@staticmethod
|
|
58
|
-
def get_cli_options() -> list[click.Option]:
|
|
59
|
-
options = [
|
|
60
|
-
click.Option(
|
|
61
|
-
["--download-dir"],
|
|
62
|
-
help="Where files are downloaded to, defaults to a location at"
|
|
63
|
-
"`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
|
|
64
|
-
),
|
|
65
|
-
]
|
|
66
|
-
return options
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
google_drive_src_cmd = SrcCmd(
|
|
70
|
-
cmd_name=CONNECTOR_TYPE,
|
|
71
|
-
connection_config=GoogleDriveCliConnectionConfig,
|
|
72
|
-
indexer_config=GoogleDriveCliIndexerConfig,
|
|
73
|
-
downloader_config=GoogleDriveCliDownloadConfig,
|
|
74
|
-
)
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.local import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class LocalCliIndexerConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--input-path"],
|
|
17
|
-
required=True,
|
|
18
|
-
type=click.Path(file_okay=True, dir_okay=True, exists=True),
|
|
19
|
-
help="Path to the location in the local file system that will be processed.",
|
|
20
|
-
),
|
|
21
|
-
click.Option(
|
|
22
|
-
["--recursive"],
|
|
23
|
-
is_flag=True,
|
|
24
|
-
default=False,
|
|
25
|
-
help="Recursively download files in their respective folders "
|
|
26
|
-
"otherwise stop at the files in provided folder level.",
|
|
27
|
-
),
|
|
28
|
-
]
|
|
29
|
-
return options
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@dataclass
|
|
33
|
-
class LocalCliUploaderConfig(CliConfig):
|
|
34
|
-
@staticmethod
|
|
35
|
-
def get_cli_options() -> list[click.Option]:
|
|
36
|
-
options = [
|
|
37
|
-
click.Option(
|
|
38
|
-
["--output-dir"],
|
|
39
|
-
required=True,
|
|
40
|
-
type=str,
|
|
41
|
-
help="Local path to write partitioned output to",
|
|
42
|
-
)
|
|
43
|
-
]
|
|
44
|
-
return options
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
local_src_cmd = SrcCmd(
|
|
48
|
-
cmd_name=CONNECTOR_TYPE,
|
|
49
|
-
indexer_config=LocalCliIndexerConfig,
|
|
50
|
-
)
|
|
51
|
-
|
|
52
|
-
local_dest_cmd = DestCmd(cmd_name=CONNECTOR_TYPE, uploader_config=LocalCliUploaderConfig)
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.milvus import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class MilvusCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--uri"],
|
|
17
|
-
required=False,
|
|
18
|
-
type=str,
|
|
19
|
-
default=None,
|
|
20
|
-
help="Milvus uri, eg 'http://localhost:19530",
|
|
21
|
-
),
|
|
22
|
-
click.Option(
|
|
23
|
-
["--user"],
|
|
24
|
-
required=False,
|
|
25
|
-
type=str,
|
|
26
|
-
default=None,
|
|
27
|
-
help="Milvus user",
|
|
28
|
-
),
|
|
29
|
-
click.Option(
|
|
30
|
-
["--password"],
|
|
31
|
-
required=False,
|
|
32
|
-
type=str,
|
|
33
|
-
default=None,
|
|
34
|
-
help="Milvus password",
|
|
35
|
-
),
|
|
36
|
-
click.Option(
|
|
37
|
-
["--db-name"],
|
|
38
|
-
required=False,
|
|
39
|
-
type=str,
|
|
40
|
-
default=None,
|
|
41
|
-
help="Milvus database name",
|
|
42
|
-
),
|
|
43
|
-
]
|
|
44
|
-
return options
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
@dataclass
|
|
48
|
-
class MilvusCliUploaderConfig(CliConfig):
|
|
49
|
-
@staticmethod
|
|
50
|
-
def get_cli_options() -> list[click.Option]:
|
|
51
|
-
options = [
|
|
52
|
-
click.Option(
|
|
53
|
-
["--collection-name"],
|
|
54
|
-
required=True,
|
|
55
|
-
type=str,
|
|
56
|
-
help="Milvus collections to write to",
|
|
57
|
-
),
|
|
58
|
-
click.Option(
|
|
59
|
-
["--num-of-processes"],
|
|
60
|
-
type=click.IntRange(min=1),
|
|
61
|
-
default=4,
|
|
62
|
-
help="number of processes to use when writing to support parallel writes",
|
|
63
|
-
),
|
|
64
|
-
]
|
|
65
|
-
return options
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
milvus_dest_cmd = DestCmd(
|
|
69
|
-
cmd_name=CONNECTOR_TYPE,
|
|
70
|
-
connection_config=MilvusCliConnectionConfig,
|
|
71
|
-
uploader_config=MilvusCliUploaderConfig,
|
|
72
|
-
)
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.mongodb import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class MongoDBCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--uri"],
|
|
17
|
-
help="URI to user when connecting",
|
|
18
|
-
),
|
|
19
|
-
click.Option(
|
|
20
|
-
["--host"],
|
|
21
|
-
help="hostname or IP address or Unix domain socket path of a single mongod or "
|
|
22
|
-
"mongos instance to connect to, or a list of hostnames",
|
|
23
|
-
),
|
|
24
|
-
click.Option(["--port"], type=int, default=27017),
|
|
25
|
-
click.Option(
|
|
26
|
-
["--database"], type=str, required=True, help="database name to connect to"
|
|
27
|
-
),
|
|
28
|
-
click.Option(
|
|
29
|
-
["--collection"], required=True, type=str, help="collection name to connect to"
|
|
30
|
-
),
|
|
31
|
-
]
|
|
32
|
-
return options
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@dataclass
|
|
36
|
-
class MongoDBCliUploaderConfig(CliConfig):
|
|
37
|
-
@staticmethod
|
|
38
|
-
def get_cli_options() -> list[click.Option]:
|
|
39
|
-
options = [
|
|
40
|
-
click.Option(
|
|
41
|
-
["--batch-size"],
|
|
42
|
-
default=100,
|
|
43
|
-
type=int,
|
|
44
|
-
help="Number of records per batch",
|
|
45
|
-
)
|
|
46
|
-
]
|
|
47
|
-
return options
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
@dataclass
|
|
51
|
-
class MongoDBCliUploadStagerConfig(CliConfig):
|
|
52
|
-
@staticmethod
|
|
53
|
-
def get_cli_options() -> list[click.Option]:
|
|
54
|
-
return []
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
mongodb_dest_cmd = DestCmd(
|
|
58
|
-
cmd_name=CONNECTOR_TYPE,
|
|
59
|
-
connection_config=MongoDBCliConnectionConfig,
|
|
60
|
-
uploader_config=MongoDBCliUploaderConfig,
|
|
61
|
-
upload_stager_config=MongoDBCliUploadStagerConfig,
|
|
62
|
-
)
|
|
@@ -1,91 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
-
from unstructured_ingest.v2.processes.connectors.onedrive import CONNECTOR_TYPE
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@dataclass
|
|
11
|
-
class OnedriveCliConnectionConfig(CliConfig):
|
|
12
|
-
@staticmethod
|
|
13
|
-
def get_cli_options() -> list[click.Option]:
|
|
14
|
-
options = [
|
|
15
|
-
click.Option(
|
|
16
|
-
["--client-id"],
|
|
17
|
-
required=True,
|
|
18
|
-
type=str,
|
|
19
|
-
help="Microsoft app client ID",
|
|
20
|
-
),
|
|
21
|
-
click.Option(
|
|
22
|
-
["--client-cred"],
|
|
23
|
-
required=True,
|
|
24
|
-
type=str,
|
|
25
|
-
help="Microsoft App client secret",
|
|
26
|
-
),
|
|
27
|
-
click.Option(
|
|
28
|
-
["--user-pname"],
|
|
29
|
-
required=True,
|
|
30
|
-
type=str,
|
|
31
|
-
help="User principal name, usually is your Azure AD email.",
|
|
32
|
-
),
|
|
33
|
-
click.Option(
|
|
34
|
-
["--tenant"],
|
|
35
|
-
default="common",
|
|
36
|
-
type=str,
|
|
37
|
-
help="ID or domain name associated with your Azure AD instance",
|
|
38
|
-
),
|
|
39
|
-
click.Option(
|
|
40
|
-
["--authority-url"],
|
|
41
|
-
default="https://login.microsoftonline.com",
|
|
42
|
-
type=str,
|
|
43
|
-
help="Authentication token provider for Microsoft apps, default is "
|
|
44
|
-
"https://login.microsoftonline.com",
|
|
45
|
-
),
|
|
46
|
-
]
|
|
47
|
-
return options
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
@dataclass
|
|
51
|
-
class OnedriveCliIndexerConfig(CliConfig):
|
|
52
|
-
@staticmethod
|
|
53
|
-
def get_cli_options() -> list[click.Option]:
|
|
54
|
-
options = [
|
|
55
|
-
click.Option(
|
|
56
|
-
["--path"],
|
|
57
|
-
default=None,
|
|
58
|
-
type=str,
|
|
59
|
-
help="Folder to start parsing files from.",
|
|
60
|
-
),
|
|
61
|
-
click.Option(
|
|
62
|
-
["--recursive"],
|
|
63
|
-
is_flag=True,
|
|
64
|
-
default=False,
|
|
65
|
-
help="Recursively download files in their respective folders "
|
|
66
|
-
"otherwise stop at the files in provided folder level.",
|
|
67
|
-
),
|
|
68
|
-
]
|
|
69
|
-
return options
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
@dataclass
|
|
73
|
-
class OnedriveCliDownloadConfig(CliConfig):
|
|
74
|
-
@staticmethod
|
|
75
|
-
def get_cli_options() -> list[click.Option]:
|
|
76
|
-
options = [
|
|
77
|
-
click.Option(
|
|
78
|
-
["--download-dir"],
|
|
79
|
-
help="Where files are downloaded to, defaults to a location at"
|
|
80
|
-
"`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
|
|
81
|
-
),
|
|
82
|
-
]
|
|
83
|
-
return options
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
onedrive_drive_src_cmd = SrcCmd(
|
|
87
|
-
cmd_name=CONNECTOR_TYPE,
|
|
88
|
-
connection_config=OnedriveCliConnectionConfig,
|
|
89
|
-
indexer_config=OnedriveCliIndexerConfig,
|
|
90
|
-
downloader_config=OnedriveCliDownloadConfig,
|
|
91
|
-
)
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import click
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
|
|
6
|
-
from unstructured_ingest.v2.cli.cmds.elasticsearch import (
|
|
7
|
-
ElasticsearchCliDownloadConfig,
|
|
8
|
-
ElasticsearchCliIndexerConfig,
|
|
9
|
-
ElasticsearchCliUploadStagerConfig,
|
|
10
|
-
ElasticsearchUploaderConfig,
|
|
11
|
-
)
|
|
12
|
-
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
13
|
-
from unstructured_ingest.v2.cli.utils import DelimitedString
|
|
14
|
-
from unstructured_ingest.v2.processes.connectors.opensearch import CONNECTOR_TYPE
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@dataclass
|
|
18
|
-
class OpenSearchCliConnectionConfig(CliConfig):
|
|
19
|
-
@staticmethod
|
|
20
|
-
def get_cli_options() -> list[click.Option]:
|
|
21
|
-
options = [
|
|
22
|
-
click.Option(
|
|
23
|
-
["--hosts"],
|
|
24
|
-
type=DelimitedString(),
|
|
25
|
-
help='List of the OpenSearch hosts to connect to, e.g. "http://localhost:9200"',
|
|
26
|
-
),
|
|
27
|
-
click.Option(
|
|
28
|
-
["--username"], type=str, default=None, help="username when using basic auth"
|
|
29
|
-
),
|
|
30
|
-
click.Option(
|
|
31
|
-
["--password"],
|
|
32
|
-
type=str,
|
|
33
|
-
default=None,
|
|
34
|
-
help="password when using basic auth",
|
|
35
|
-
),
|
|
36
|
-
click.Option(
|
|
37
|
-
["--use-ssl"],
|
|
38
|
-
type=bool,
|
|
39
|
-
default=False,
|
|
40
|
-
is_flag=True,
|
|
41
|
-
help="use ssl for the connection",
|
|
42
|
-
),
|
|
43
|
-
click.Option(
|
|
44
|
-
["--verify-certs"],
|
|
45
|
-
type=bool,
|
|
46
|
-
default=False,
|
|
47
|
-
is_flag=True,
|
|
48
|
-
help="whether to verify SSL certificates",
|
|
49
|
-
),
|
|
50
|
-
click.Option(
|
|
51
|
-
["--ssl-show-warn"],
|
|
52
|
-
type=bool,
|
|
53
|
-
default=False,
|
|
54
|
-
is_flag=True,
|
|
55
|
-
help="show warning when verify certs is disabled",
|
|
56
|
-
),
|
|
57
|
-
click.Option(
|
|
58
|
-
["--ca-certs"],
|
|
59
|
-
type=click.Path(),
|
|
60
|
-
default=None,
|
|
61
|
-
help="path to CA bundle",
|
|
62
|
-
),
|
|
63
|
-
click.Option(
|
|
64
|
-
["--client-cert"],
|
|
65
|
-
type=click.Path(),
|
|
66
|
-
default=None,
|
|
67
|
-
help="path to the file containing the private key and the certificate,"
|
|
68
|
-
" or cert only if using client_key",
|
|
69
|
-
),
|
|
70
|
-
click.Option(
|
|
71
|
-
["--client-key"],
|
|
72
|
-
type=click.Path(),
|
|
73
|
-
default=None,
|
|
74
|
-
help="path to the file containing the private key"
|
|
75
|
-
" if using separate cert and key files",
|
|
76
|
-
),
|
|
77
|
-
]
|
|
78
|
-
return options
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
opensearch_src_cmd = SrcCmd(
|
|
82
|
-
cmd_name=CONNECTOR_TYPE,
|
|
83
|
-
connection_config=OpenSearchCliConnectionConfig,
|
|
84
|
-
indexer_config=ElasticsearchCliIndexerConfig,
|
|
85
|
-
downloader_config=ElasticsearchCliDownloadConfig,
|
|
86
|
-
)
|
|
87
|
-
|
|
88
|
-
opensearch_dest_cmd = DestCmd(
|
|
89
|
-
cmd_name=CONNECTOR_TYPE,
|
|
90
|
-
connection_config=OpenSearchCliConnectionConfig,
|
|
91
|
-
upload_stager_config=ElasticsearchCliUploadStagerConfig,
|
|
92
|
-
uploader_config=ElasticsearchUploaderConfig,
|
|
93
|
-
)
|