camel-ai 0.1.6.2__py3-none-any.whl → 0.1.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of camel-ai might be problematic.
- camel/__init__.py +1 -1
- camel/interpreters/docker_interpreter.py +1 -1
- camel/loaders/__init__.py +1 -2
- camel/loaders/base_io.py +118 -52
- camel/loaders/jina_url_reader.py +6 -6
- camel/loaders/unstructured_io.py +24 -286
- camel/retrievers/auto_retriever.py +25 -35
- camel/retrievers/vector_retriever.py +20 -18
- camel/storages/object_storages/__init__.py +22 -0
- camel/storages/object_storages/amazon_s3.py +205 -0
- camel/storages/object_storages/azure_blob.py +166 -0
- camel/storages/object_storages/base.py +115 -0
- camel/storages/object_storages/google_cloud.py +152 -0
- camel/toolkits/retrieval_toolkit.py +5 -5
- camel/toolkits/search_toolkit.py +4 -4
- {camel_ai-0.1.6.2.dist-info → camel_ai-0.1.6.3.dist-info}/METADATA +7 -3
- {camel_ai-0.1.6.2.dist-info → camel_ai-0.1.6.3.dist-info}/RECORD +18 -13
- {camel_ai-0.1.6.2.dist-info → camel_ai-0.1.6.3.dist-info}/WHEEL +0 -0
camel/loaders/unstructured_io.py
CHANGED
@@ -12,12 +12,18 @@
 # limitations under the License.
 # =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
 import uuid
-from typing import
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+)
 
 from unstructured.documents.elements import Element
 
-from camel.utils import dependencies_required
-
 
 class UnstructuredIO:
     r"""A class to handle various functionalities provided by the
@@ -25,56 +31,12 @@ class UnstructuredIO:
     extracting, staging, chunking data, and integrating with cloud
     services like S3 and Azure for data connection.
 
-
-
-    the Unstructured library.
+    References:
+        https://docs.unstructured.io/
     """
 
-
-
-    def __init__(self):
-        r"""Initializes the UnstructuredIO class and ensures the
-        installed version of Unstructured library meets the minimum
-        requirements.
-        """
-        self._ensure_unstructured_version(self.UNSTRUCTURED_MIN_VERSION)
-
-    @dependencies_required('unstructured')
-    def _ensure_unstructured_version(self, min_version: str) -> None:
-        r"""Validates that the installed 'Unstructured' library version
-        satisfies the specified minimum version requirement. This function is
-        essential for ensuring compatibility with features that depend on a
-        certain version of the 'Unstructured' package.
-
-        Args:
-            min_version (str): The minimum version required, specified in
-                `'major.minor.patch'` format.
-
-        Raises:
-            ImportError: If the 'Unstructured' package is not available in the
-                environment.
-            ValueError: If the current `'Unstructured'` version is older than
-                the required minimum version.
-
-        Notes:
-            Uses the 'packaging.version' module to parse and compare version
-            strings.
-        """
-        from packaging import version
-        from unstructured.__version__ import __version__
-
-        # Use packaging.version to compare versions
-        min_ver = version.parse(min_version)
-        installed_ver = version.parse(__version__)
-
-        if installed_ver < min_ver:
-            raise ValueError(
-                f"Require `unstructured>={min_version}`, "
-                f"you have {__version__}."
-            )
-
+    @staticmethod
     def create_element_from_text(
-        self,
         text: str,
         element_id: Optional[Union[str, uuid.UUID]] = None,
         embeddings: Optional[List[float]] = None,
@@ -89,8 +51,8 @@ class UnstructuredIO:
 
         Args:
             text (str): The text content for the element.
-            element_id (Union[str, uuid.UUID], optional): Unique
-
+            element_id (Optional[Union[str, uuid.UUID]], optional): Unique
+                identifier for the element. Defaults to `None`.
             embeddings (Optional[List[float]], optional): A list of float
                 numbers representing the text embeddings. Defaults to `None`.
             filename (Optional[str], optional): The name of the file the
@@ -120,13 +82,13 @@ class UnstructuredIO:
 
         return Text(
             text=text,
-            element_id=element_id
+            element_id=element_id or uuid.uuid4(),
             metadata=metadata,
             embeddings=embeddings,
         )
 
+    @staticmethod
     def parse_file_or_url(
-        self,
         input_path: str,
         **kwargs: Any,
     ) -> List[Element]:
@@ -189,8 +151,8 @@ class UnstructuredIO:
                 "Failed to parse the unstructured file."
             ) from e
 
+    @staticmethod
     def clean_text_data(
-        self,
         text: str,
         clean_options: Optional[List[Tuple[str, Dict[str, Any]]]] = None,
     ) -> str:
@@ -253,7 +215,7 @@ class UnstructuredIO:
         )
         from unstructured.cleaners.translate import translate_text
 
-        cleaning_functions = {
+        cleaning_functions: Any = {
            "clean_extra_whitespace": clean_extra_whitespace,
            "clean_bullets": clean_bullets,
            "clean_ordered_bullets": clean_ordered_bullets,
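Since `clean_text_data` is now a static method, it can be called without instantiating `UnstructuredIO`. A minimal sketch of the `clean_options` dispatch shown above (the sample text is illustrative; each tuple names a key from `cleaning_functions` plus its keyword arguments):

from camel.loaders import UnstructuredIO

# Cleaners are resolved through the cleaning_functions lookup in the hunk above.
cleaned = UnstructuredIO.clean_text_data(
    text="●  Some   bulleted   text",
    clean_options=[
        ("clean_bullets", {}),
        ("clean_extra_whitespace", {}),
    ],
)
print(cleaned)  # expected: "Some bulleted text"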
@@ -291,8 +253,8 @@ class UnstructuredIO:
 
         return cleaned_text
 
+    @staticmethod
     def extract_data_from_text(
-        self,
         text: str,
         extract_type: Literal[
             'extract_datetimetz',
@@ -340,7 +302,7 @@ class UnstructuredIO:
             extract_us_phone_number,
         )
 
-        extraction_functions = {
+        extraction_functions: Any = {
            "extract_datetimetz": extract_datetimetz,
            "extract_email_address": extract_email_address,
            "extract_ip_address": extract_ip_address,
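`extract_data_from_text` follows the same lookup pattern via `extraction_functions`; a short sketch using the `extract_email_address` key visible above (the address is hypothetical):

from camel.loaders import UnstructuredIO

emails = UnstructuredIO.extract_data_from_text(
    text="Reach the team at hello@example.org for details.",
    extract_type="extract_email_address",
)
print(emails)  # expected: ['hello@example.org']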
@@ -357,8 +319,8 @@ class UnstructuredIO:
 
         return extraction_functions[extract_type](text, **kwargs)
 
+    @staticmethod
     def stage_elements(
-        self,
         elements: List[Any],
         stage_type: Literal[
             'convert_to_csv',
@@ -416,7 +378,7 @@ class UnstructuredIO:
            weaviate,
        )
 
-        staging_functions = {
+        staging_functions: Any = {
            "convert_to_csv": base.convert_to_csv,
            "convert_to_dataframe": base.convert_to_dataframe,
            "convert_to_dict": base.convert_to_dict,
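Likewise for `stage_elements`; a sketch staging a single element into plain dictionaries via the `convert_to_dict` key shown above:

from camel.loaders import UnstructuredIO

element = UnstructuredIO.create_element_from_text(text="Hello CAMEL")
records = UnstructuredIO.stage_elements(
    elements=[element], stage_type="convert_to_dict",
)
print(records)  # expected: a list with one dict describing the element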
@@ -441,8 +403,9 @@ class UnstructuredIO:
 
         return staging_functions[stage_type](elements, **kwargs)
 
+    @staticmethod
     def chunk_elements(
-        self, elements: List[Any], chunk_type: str, **kwargs
+        elements: List[Any], chunk_type: str, **kwargs
     ) -> List[Element]:
         r"""Chunks elements by titles.
 
@@ -470,228 +433,3 @@ class UnstructuredIO:
 
         # Format chunks into a list of dictionaries (or your preferred format)
         return chunking_functions[chunk_type](elements, **kwargs)
-
-    def run_s3_ingest(
-        self,
-        s3_url: str,
-        output_dir: str,
-        num_processes: int = 2,
-        anonymous: bool = True,
-    ) -> None:
-        r"""Processes documents from an S3 bucket and stores structured
-        outputs locally.
-
-        Args:
-            s3_url (str): The URL of the S3 bucket.
-            output_dir (str): Local directory to store the processed outputs.
-            num_processes (int, optional): Number of processes to use.
-                (default: :obj:`2`)
-            anonymous (bool, optional): Flag to run anonymously if
-                required. (default: :obj:`True`)
-
-        Notes:
-            You need to install the necessary extras by using:
-            `pip install "unstructured[s3]"`.
-
-        References:
-            https://unstructured-io.github.io/unstructured/
-        """
-
-        from unstructured.ingest.interfaces import (
-            FsspecConfig,
-            PartitionConfig,
-            ProcessorConfig,
-            ReadConfig,
-        )
-        from unstructured.ingest.runner import S3Runner
-
-        runner = S3Runner(
-            processor_config=ProcessorConfig(
-                verbose=True,
-                output_dir=output_dir,
-                num_processes=num_processes,
-            ),
-            read_config=ReadConfig(),
-            partition_config=PartitionConfig(),
-            fsspec_config=FsspecConfig(remote_url=s3_url),
-        )
-        runner.run(anonymous=anonymous)
-
-    def run_azure_ingest(
-        self,
-        azure_url: str,
-        output_dir: str,
-        account_name: str,
-        num_processes: int = 2,
-    ) -> None:
-        r"""Processes documents from an Azure storage container and stores
-        structured outputs locally.
-
-        Args:
-            azure_url (str): The URL of the Azure storage container.
-            output_dir (str): Local directory to store the processed outputs.
-            account_name (str): Azure account name for accessing the container.
-            num_processes (int, optional): Number of processes to use.
-                (default: :obj:`2`)
-
-        Notes:
-            You need to install the necessary extras by using:
-            `pip install "unstructured[azure]"`.
-
-        References:
-            https://unstructured-io.github.io/unstructured/
-        """
-        from unstructured.ingest.interfaces import (
-            FsspecConfig,
-            PartitionConfig,
-            ProcessorConfig,
-            ReadConfig,
-        )
-        from unstructured.ingest.runner import AzureRunner
-
-        runner = AzureRunner(
-            processor_config=ProcessorConfig(
-                verbose=True,
-                output_dir=output_dir,
-                num_processes=num_processes,
-            ),
-            read_config=ReadConfig(),
-            partition_config=PartitionConfig(),
-            fsspec_config=FsspecConfig(remote_url=azure_url),
-        )
-        runner.run(account_name=account_name)
-
-    def run_github_ingest(
-        self,
-        repo_url: str,
-        git_branch: str,
-        output_dir: str,
-        num_processes: int = 2,
-    ) -> None:
-        r"""Processes documents from a GitHub repository and stores
-        structured outputs locally.
-
-        Args:
-            repo_url (str): URL of the GitHub repository.
-            git_branch (str): Git branch name to process.
-            output_dir (str): Local directory to store the processed outputs.
-            num_processes (int, optional): Number of processes to use.
-                (default: :obj:`2`)
-
-        Notes:
-            You need to install the necessary extras by using:
-            `pip install "unstructured[github]"`.
-
-        References:
-            https://unstructured-io.github.io/unstructured/
-        """
-        from unstructured.ingest.interfaces import (
-            PartitionConfig,
-            ProcessorConfig,
-            ReadConfig,
-        )
-        from unstructured.ingest.runner import GithubRunner
-
-        runner = GithubRunner(
-            processor_config=ProcessorConfig(
-                verbose=True,
-                output_dir=output_dir,
-                num_processes=num_processes,
-            ),
-            read_config=ReadConfig(),
-            partition_config=PartitionConfig(),
-        )
-        runner.run(url=repo_url, git_branch=git_branch)
-
-    def run_slack_ingest(
-        self,
-        channels: List[str],
-        token: str,
-        start_date: str,
-        end_date: str,
-        output_dir: str,
-        num_processes: int = 2,
-    ) -> None:
-        r"""Processes documents from specified Slack channels and stores
-        structured outputs locally.
-
-        Args:
-            channels (List[str]): List of Slack channel IDs.
-            token (str): Slack API token.
-            start_date (str): Start date for fetching data.
-            end_date (str): End date for fetching data.
-            output_dir (str): Local directory to store the processed outputs.
-            num_processes (int, optional): Number of processes to use.
-                (default: :obj:`2`)
-
-        Notes:
-            You need to install the necessary extras by using:
-            `pip install "unstructured[slack]"`.
-
-        References:
-            https://unstructured-io.github.io/unstructured/
-        """
-        from unstructured.ingest.interfaces import (
-            PartitionConfig,
-            ProcessorConfig,
-            ReadConfig,
-        )
-        from unstructured.ingest.runner import SlackRunner
-
-        runner = SlackRunner(
-            processor_config=ProcessorConfig(
-                verbose=True,
-                output_dir=output_dir,
-                num_processes=num_processes,
-            ),
-            read_config=ReadConfig(),
-            partition_config=PartitionConfig(),
-        )
-        runner.run(
-            channels=channels,
-            token=token,
-            start_date=start_date,
-            end_date=end_date,
-        )
-
-    def run_discord_ingest(
-        self,
-        channels: List[str],
-        token: str,
-        output_dir: str,
-        num_processes: int = 2,
-    ) -> None:
-        r"""Processes messages from specified Discord channels and stores
-        structured outputs locally.
-
-        Args:
-            channels (List[str]): List of Discord channel IDs.
-            token (str): Discord bot token.
-            output_dir (str): Local directory to store the processed outputs.
-            num_processes (int, optional): Number of processes to use.
-                (default: :obj:`2`)
-
-        Notes:
-            You need to install the necessary extras by using:
-            `pip install "unstructured[discord]"`.
-
-        References:
-            https://unstructured-io.github.io/unstructured/
-        """
-        from unstructured.ingest.interfaces import (
-            PartitionConfig,
-            ProcessorConfig,
-            ReadConfig,
-        )
-        from unstructured.ingest.runner import DiscordRunner
-
-        runner = DiscordRunner(
-            processor_config=ProcessorConfig(
-                verbose=True,
-                output_dir=output_dir,
-                num_processes=num_processes,
-            ),
-            read_config=ReadConfig(),
-            partition_config=PartitionConfig(),
-        )
-        runner.run(channels=channels, token=token)
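Taken together, this file drops the constructor-time version check and the five run_*_ingest wrappers, and every remaining helper becomes a static method. A minimal sketch of the new call style (the sample text is illustrative):

from camel.loaders import UnstructuredIO

# No instantiation needed anymore; element_id now defaults to a fresh uuid4.
element = UnstructuredIO.create_element_from_text(
    text="CAMEL is a multi-agent framework.",
)
chunks = UnstructuredIO.chunk_elements(
    elements=[element], chunk_type="chunk_by_title",
)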
camel/retrievers/auto_retriever.py
CHANGED

@@ -97,36 +97,36 @@ class AutoRetriever:
             f"Unsupported vector storage type: {self.storage_type}"
         )
 
-    def _collection_name_generator(self, content_input_path: str) -> str:
+    def _collection_name_generator(self, content: str) -> str:
         r"""Generates a valid collection name from a given file path or URL.
 
         Args:
-
-                generate the collection name.
+            contents (str): Local file path, remote URL or string content.
 
         Returns:
             str: A sanitized, valid collection name suitable for use.
         """
-        # Check
-        parsed_url = urlparse(content_input_path)
-
+        # Check if the content is URL
+        parsed_url = urlparse(content)
+        is_url = all([parsed_url.scheme, parsed_url.netloc])
 
         # Convert given path into a collection name, ensuring it only
         # contains numbers, letters, and underscores
-        if
+        if is_url:
             # For URLs, remove https://, replace /, and any characters not
             # allowed by Milvus with _
             collection_name = re.sub(
                 r'[^0-9a-zA-Z]+',
                 '_',
-
+                content.replace("https://", ""),
             )
-
+        elif os.path.exists(content):
             # For file paths, get the stem and replace spaces with _, also
             # ensuring only allowed characters are present
-            collection_name = re.sub(
-
-
+            collection_name = re.sub(r'[^0-9a-zA-Z]+', '_', Path(content).stem)
+        else:
+            # the content is string input
+            collection_name = content[:10]
 
         # Ensure the collection name does not start or end with underscore
         collection_name = collection_name.strip("_")
@@ -193,7 +193,7 @@ class AutoRetriever:
     def run_vector_retriever(
         self,
         query: str,
-        content_input_paths: Union[str, List[str]],
+        contents: Union[str, List[str]],
         top_k: int = DEFAULT_TOP_K_RESULTS,
         similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
         return_detailed_info: bool = False,
@@ -203,8 +203,8 @@ class AutoRetriever:
 
         Args:
             query (str): Query string for information retriever.
-
-
+            contents (Union[str, List[str]]): Local file paths, remote URLs or
+                string contents.
             top_k (int, optional): The number of top results to return during
                 retrieve. Must be a positive integer. Defaults to
                 `DEFAULT_TOP_K_RESULTS`.
@@ -223,24 +223,18 @@ class AutoRetriever:
         Raises:
             ValueError: If there's an vector storage existing with content
                 name in the vector path but the payload is None. If
-                `
+                `contents` is empty.
             RuntimeError: If any errors occur during the retrieve process.
         """
-        if not content_input_paths:
-            raise ValueError("
+        if not contents:
+            raise ValueError("content cannot be empty.")
 
-        content_input_paths = (
-            [content_input_paths]
-            if isinstance(content_input_paths, str)
-            else content_input_paths
-        )
+        contents = [contents] if isinstance(contents, str) else contents
 
         all_retrieved_info = []
-        for content_input_path in content_input_paths:
+        for content in contents:
             # Generate a valid collection name
-            collection_name = self._collection_name_generator(
-                content_input_path
-            )
+            collection_name = self._collection_name_generator(content)
             try:
                 vector_storage_instance = self._initialize_vector_storage(
                     collection_name
@@ -251,13 +245,11 @@ class AutoRetriever:
                 file_is_modified = False  # initialize with a default value
                 if (
                     vector_storage_instance.status().vector_count != 0
-                    and
+                    and os.path.exists(content)
                 ):
                     # Get original modified date from file
                     modified_date_from_file = (
-                        self._get_file_modified_date_from_file(
-                            content_input_path
-                        )
+                        self._get_file_modified_date_from_file(content)
                     )
                     # Get modified date from vector storage
                     modified_date_from_storage = (
@@ -280,18 +272,16 @@ class AutoRetriever:
                     # Process and store the content to the vector storage
                     vr = VectorRetriever(
                         storage=vector_storage_instance,
-                        similarity_threshold=similarity_threshold,
                         embedding_model=self.embedding_model,
                     )
-                    vr.process(content_input_path)
+                    vr.process(content)
                 else:
                     vr = VectorRetriever(
                         storage=vector_storage_instance,
-                        similarity_threshold=similarity_threshold,
                         embedding_model=self.embedding_model,
                     )
                 # Retrieve info by given query from the vector storage
-                retrieved_info = vr.query(query, top_k)
+                retrieved_info = vr.query(query, top_k, similarity_threshold)
                 all_retrieved_info.extend(retrieved_info)
             except Exception as e:
                 raise RuntimeError(
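`run_vector_retriever` now takes `contents`, which may be a single string or a list mixing file paths, URLs, and raw text (raw text falls back to its first ten characters for the collection name, per `_collection_name_generator`). A hedged sketch; the `vector_storage_local_path` constructor argument is an assumption, since the constructor is not part of this diff:

from camel.retrievers import AutoRetriever

retriever = AutoRetriever(vector_storage_local_path="local_data/")
results = retriever.run_vector_retriever(
    query="What is CAMEL?",
    contents=[
        "https://www.camel-ai.org/",           # remote URL
        "CAMEL is a multi-agent framework.",   # raw string content
    ],
    top_k=1,
    similarity_threshold=0.75,
)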
camel/retrievers/vector_retriever.py
CHANGED

@@ -11,7 +11,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+import os
 from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
 
 from camel.embeddings import BaseEmbedding, OpenAIEmbedding
 from camel.loaders import UnstructuredIO
@@ -38,24 +40,18 @@ class VectorRetriever(BaseRetriever):
         embedding_model (BaseEmbedding): Embedding model used to generate
             vector embeddings.
         storage (BaseVectorStorage): Vector storage to query.
-        similarity_threshold (float, optional): The similarity threshold
-            for filtering results. Defaults to `DEFAULT_SIMILARITY_THRESHOLD`.
         unstructured_modules (UnstructuredIO): A module for parsing files and
             URLs and chunking content based on specified parameters.
     """
 
     def __init__(
         self,
-        similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
         embedding_model: Optional[BaseEmbedding] = None,
         storage: Optional[BaseVectorStorage] = None,
     ) -> None:
         r"""Initializes the retriever class with an optional embedding model.
 
         Args:
-            similarity_threshold (float, optional): The similarity threshold
-                for filtering results. Defaults to
-                `DEFAULT_SIMILARITY_THRESHOLD`.
             embedding_model (Optional[BaseEmbedding]): The embedding model
                 instance. Defaults to `OpenAIEmbedding` if not provided.
             storage (BaseVectorStorage): Vector storage to query.
@@ -68,12 +64,11 @@ class VectorRetriever(BaseRetriever):
                 vector_dim=self.embedding_model.get_output_dim()
             )
         )
-        self.similarity_threshold = similarity_threshold
-        self.unstructured_modules: UnstructuredIO = UnstructuredIO()
+        self.uio: UnstructuredIO = UnstructuredIO()
 
     def process(
         self,
-        content_input_path: str,
+        content: str,
         chunk_type: str = "chunk_by_title",
         **kwargs: Any,
     ) -> None:
@@ -82,16 +77,19 @@ class VectorRetriever(BaseRetriever):
         vector storage.
 
         Args:
-
-                processed.
+            contents (str): Local file path, remote URL or string content.
             chunk_type (str): Type of chunking going to apply. Defaults to
                 "chunk_by_title".
             **kwargs (Any): Additional keyword arguments for content parsing.
         """
-        elements = self.unstructured_modules.parse_file_or_url(
-            content_input_path, **kwargs
-        )
-        chunks = self.unstructured_modules.chunk_elements(
+        # Check if the content is URL
+        parsed_url = urlparse(content)
+        is_url = all([parsed_url.scheme, parsed_url.netloc])
+        if is_url or os.path.exists(content):
+            elements = self.uio.parse_file_or_url(content, **kwargs)
+        else:
+            elements = [self.uio.create_element_from_text(text=content)]
+        chunks = self.uio.chunk_elements(
             chunk_type=chunk_type, elements=elements
         )
         # Iterate to process and store embeddings, set batch of 50
@@ -105,7 +103,7 @@ class VectorRetriever(BaseRetriever):
             # Prepare the payload for each vector record, includes the content
             # path, chunk metadata, and chunk text
             for vector, chunk in zip(batch_vectors, batch_chunks):
-                content_path_info = {"content path": content_input_path}
+                content_path_info = {"content path": content}
                 chunk_metadata = {"metadata": chunk.metadata.to_dict()}
                 chunk_text = {"text": str(chunk)}
                 combined_dict = {
@@ -124,12 +122,16 @@ class VectorRetriever(BaseRetriever):
         self,
         query: str,
         top_k: int = DEFAULT_TOP_K_RESULTS,
+        similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
     ) -> List[Dict[str, Any]]:
         r"""Executes a query in vector storage and compiles the retrieved
         results into a dictionary.
 
         Args:
             query (str): Query string for information retriever.
+            similarity_threshold (float, optional): The similarity threshold
+                for filtering results. Defaults to
+                `DEFAULT_SIMILARITY_THRESHOLD`.
             top_k (int, optional): The number of top results to return during
                 retriever. Must be a positive integer. Defaults to 1.
@@ -161,7 +163,7 @@ class VectorRetriever(BaseRetriever):
         formatted_results = []
         for result in query_results:
             if (
-                result.similarity >= self.similarity_threshold
+                result.similarity >= similarity_threshold
                 and result.record.payload is not None
             ):
                 result_dict = {
@@ -182,7 +184,7 @@ class VectorRetriever(BaseRetriever):
                     'text': (
                         f"No suitable information retrieved "
                         f"from {content_path} with similarity_threshold"
-                        f" = {self.similarity_threshold}."
+                        f" = {similarity_threshold}."
                     )
                 }
             ]
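The net effect: `similarity_threshold` moves from `__init__` to `query()`, so one `VectorRetriever` instance can serve different thresholds per call, and `process()` now accepts raw strings as well as paths and URLs. A minimal sketch, assuming the default `OpenAIEmbedding` backend and auto-created storage from the constructor defaults:

from camel.retrievers import VectorRetriever

vr = VectorRetriever()  # no similarity_threshold argument anymore
vr.process(content="https://www.camel-ai.org/")  # URL, file path, or raw text
hits = vr.query(
    query="What is CAMEL?",
    top_k=3,
    similarity_threshold=0.8,  # now passed per query
)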
camel/storages/object_storages/__init__.py
ADDED

@@ -0,0 +1,22 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+from .amazon_s3 import AmazonS3Storage
+from .azure_blob import AzureBlobStorage
+from .google_cloud import GoogleCloudStorage
+
+__all__ = [
+    "AmazonS3Storage",
+    "AzureBlobStorage",
+    "GoogleCloudStorage",
+]
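The new `camel.storages.object_storages` package groups three cloud backends behind the shared base class in `base.py`. A hedged sketch of the exports; the constructor arguments (`bucket_name`, `container_name`) are assumptions, since the backend modules' bodies are not shown in this diff:

from camel.storages.object_storages import (
    AmazonS3Storage,
    AzureBlobStorage,
    GoogleCloudStorage,
)

# Hypothetical bucket/container names; credential handling lives in the
# backend modules (amazon_s3.py, azure_blob.py, google_cloud.py).
s3 = AmazonS3Storage(bucket_name="my-bucket")
blob = AzureBlobStorage(container_name="my-container")
gcs = GoogleCloudStorage(bucket_name="my-bucket")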