cognee 0.3.4.dev4__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- cognee/api/client.py +16 -7
- cognee/api/health.py +5 -9
- cognee/api/v1/add/add.py +3 -1
- cognee/api/v1/cognify/cognify.py +44 -7
- cognee/api/v1/permissions/routers/get_permissions_router.py +8 -4
- cognee/api/v1/search/search.py +3 -0
- cognee/api/v1/ui/__init__.py +1 -1
- cognee/api/v1/ui/ui.py +215 -150
- cognee/api/v1/update/__init__.py +1 -0
- cognee/api/v1/update/routers/__init__.py +1 -0
- cognee/api/v1/update/routers/get_update_router.py +90 -0
- cognee/api/v1/update/update.py +100 -0
- cognee/base_config.py +5 -2
- cognee/cli/_cognee.py +28 -10
- cognee/cli/commands/delete_command.py +34 -2
- cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +2 -2
- cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +3 -2
- cognee/eval_framework/modal_eval_dashboard.py +9 -1
- cognee/infrastructure/databases/graph/config.py +9 -9
- cognee/infrastructure/databases/graph/get_graph_engine.py +4 -21
- cognee/infrastructure/databases/graph/kuzu/adapter.py +60 -9
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +3 -3
- cognee/infrastructure/databases/relational/config.py +4 -4
- cognee/infrastructure/databases/relational/create_relational_engine.py +11 -3
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +7 -3
- cognee/infrastructure/databases/vector/config.py +7 -7
- cognee/infrastructure/databases/vector/create_vector_engine.py +7 -15
- cognee/infrastructure/databases/vector/embeddings/EmbeddingEngine.py +9 -0
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +11 -0
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +19 -2
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -0
- cognee/infrastructure/databases/vector/embeddings/config.py +8 -0
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +5 -0
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +11 -10
- cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +48 -38
- cognee/infrastructure/databases/vector/vector_db_interface.py +8 -4
- cognee/infrastructure/files/storage/S3FileStorage.py +15 -5
- cognee/infrastructure/files/storage/s3_config.py +1 -0
- cognee/infrastructure/files/utils/open_data_file.py +7 -14
- cognee/infrastructure/llm/LLMGateway.py +19 -117
- cognee/infrastructure/llm/config.py +28 -13
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_categories.py +2 -1
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_event_entities.py +3 -2
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_summary.py +3 -2
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_content_graph.py +2 -1
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_event_graph.py +3 -2
- cognee/infrastructure/llm/prompts/read_query_prompt.py +3 -2
- cognee/infrastructure/llm/prompts/show_prompt.py +35 -0
- cognee/infrastructure/llm/prompts/test.txt +1 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +50 -397
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +2 -3
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +8 -88
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +78 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +2 -99
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +49 -401
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +19 -882
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +2 -34
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +2 -107
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/acreate_structured_output.baml +26 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/__init__.py +1 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +76 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/create_dynamic_baml_type.py +122 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +3 -3
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +0 -32
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +107 -98
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +5 -6
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +5 -6
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llm_interface.py +0 -26
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +17 -67
- cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +8 -7
- cognee/infrastructure/llm/utils.py +4 -4
- cognee/infrastructure/loaders/LoaderEngine.py +5 -2
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +244 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/methods/create_authorized_dataset.py +9 -0
- cognee/modules/data/methods/get_authorized_dataset.py +1 -1
- cognee/modules/data/methods/get_authorized_dataset_by_name.py +11 -0
- cognee/modules/data/methods/get_deletion_counts.py +92 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +1 -1
- cognee/modules/graph/utils/expand_with_nodes_and_edges.py +22 -8
- cognee/modules/graph/utils/retrieve_existing_edges.py +0 -2
- cognee/modules/ingestion/data_types/TextData.py +0 -1
- cognee/modules/observability/get_observe.py +14 -0
- cognee/modules/observability/observers.py +1 -0
- cognee/modules/ontology/base_ontology_resolver.py +42 -0
- cognee/modules/ontology/get_default_ontology_resolver.py +41 -0
- cognee/modules/ontology/matching_strategies.py +53 -0
- cognee/modules/ontology/models.py +20 -0
- cognee/modules/ontology/ontology_config.py +24 -0
- cognee/modules/ontology/ontology_env_config.py +45 -0
- cognee/modules/ontology/rdf_xml/{OntologyResolver.py → RDFLibOntologyResolver.py} +20 -28
- cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +21 -24
- cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +3 -3
- cognee/modules/retrieval/code_retriever.py +2 -1
- cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -4
- cognee/modules/retrieval/graph_completion_cot_retriever.py +6 -5
- cognee/modules/retrieval/graph_completion_retriever.py +0 -3
- cognee/modules/retrieval/insights_retriever.py +1 -1
- cognee/modules/retrieval/jaccard_retrival.py +60 -0
- cognee/modules/retrieval/lexical_retriever.py +123 -0
- cognee/modules/retrieval/natural_language_retriever.py +2 -1
- cognee/modules/retrieval/temporal_retriever.py +3 -2
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +2 -12
- cognee/modules/retrieval/utils/completion.py +4 -7
- cognee/modules/search/methods/get_search_type_tools.py +7 -0
- cognee/modules/search/methods/no_access_control_search.py +1 -1
- cognee/modules/search/methods/search.py +32 -13
- cognee/modules/search/types/SearchType.py +1 -0
- cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py +12 -0
- cognee/modules/users/permissions/methods/check_permission_on_dataset.py +11 -0
- cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +10 -0
- cognee/modules/users/permissions/methods/get_document_ids_for_user.py +10 -0
- cognee/modules/users/permissions/methods/get_principal.py +9 -0
- cognee/modules/users/permissions/methods/get_principal_datasets.py +11 -0
- cognee/modules/users/permissions/methods/get_role.py +10 -0
- cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +3 -3
- cognee/modules/users/permissions/methods/get_tenant.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_role.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_user.py +9 -0
- cognee/modules/users/permissions/methods/give_permission_on_dataset.py +10 -0
- cognee/modules/users/roles/methods/add_user_to_role.py +11 -0
- cognee/modules/users/roles/methods/create_role.py +12 -1
- cognee/modules/users/tenants/methods/add_user_to_tenant.py +12 -0
- cognee/modules/users/tenants/methods/create_tenant.py +12 -1
- cognee/modules/visualization/cognee_network_visualization.py +13 -9
- cognee/shared/data_models.py +0 -1
- cognee/shared/utils.py +0 -32
- cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
- cognee/tasks/codingagents/coding_rule_associations.py +3 -2
- cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +3 -2
- cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +3 -2
- cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +3 -2
- cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +3 -2
- cognee/tasks/graph/extract_graph_from_code.py +2 -2
- cognee/tasks/graph/extract_graph_from_data.py +55 -12
- cognee/tasks/graph/extract_graph_from_data_v2.py +16 -4
- cognee/tasks/ingestion/migrate_relational_database.py +132 -41
- cognee/tasks/ingestion/resolve_data_directories.py +4 -1
- cognee/tasks/schema/ingest_database_schema.py +134 -0
- cognee/tasks/schema/models.py +40 -0
- cognee/tasks/storage/index_data_points.py +1 -1
- cognee/tasks/storage/index_graph_edges.py +3 -1
- cognee/tasks/summarization/summarize_code.py +2 -2
- cognee/tasks/summarization/summarize_text.py +2 -2
- cognee/tasks/temporal_graph/enrich_events.py +2 -2
- cognee/tasks/temporal_graph/extract_events_and_entities.py +2 -2
- cognee/tests/cli_tests/cli_unit_tests/test_cli_commands.py +13 -4
- cognee/tests/cli_tests/cli_unit_tests/test_cli_edge_cases.py +13 -3
- cognee/tests/test_advanced_pdf_loader.py +141 -0
- cognee/tests/test_chromadb.py +40 -0
- cognee/tests/test_cognee_server_start.py +6 -1
- cognee/tests/test_data/Quantum_computers.txt +9 -0
- cognee/tests/test_lancedb.py +211 -0
- cognee/tests/test_pgvector.py +40 -0
- cognee/tests/test_relational_db_migration.py +76 -0
- cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +2 -1
- cognee/tests/unit/modules/ontology/test_ontology_adapter.py +330 -13
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +0 -4
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -4
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +0 -4
- {cognee-0.3.4.dev4.dist-info → cognee-0.3.5.dist-info}/METADATA +92 -96
- {cognee-0.3.4.dev4.dist-info → cognee-0.3.5.dist-info}/RECORD +173 -159
- distributed/pyproject.toml +0 -1
- cognee/infrastructure/data/utils/extract_keywords.py +0 -48
- cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py +0 -1227
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +0 -109
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +0 -343
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_categories.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +0 -89
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +0 -44
- cognee/tasks/graph/infer_data_ontology.py +0 -309
- cognee/tests/test_falkordb.py +0 -174
- /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/__init__.py +0 -0
- /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/__init__.py +0 -0
- /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/texts.json +0 -0
- {cognee-0.3.4.dev4.dist-info → cognee-0.3.5.dist-info}/WHEEL +0 -0
- {cognee-0.3.4.dev4.dist-info → cognee-0.3.5.dist-info}/entry_points.txt +0 -0
- {cognee-0.3.4.dev4.dist-info → cognee-0.3.5.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.4.dev4.dist-info → cognee-0.3.5.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/loaders/external/advanced_pdf_loader.py (new file)
@@ -0,0 +1,244 @@
+"""Advanced PDF loader leveraging unstructured for layout-aware extraction."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+import asyncio
+from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
+from cognee.shared.logging_utils import get_logger
+
+from cognee.infrastructure.loaders.external.pypdf_loader import PyPdfLoader
+
+logger = get_logger(__name__)
+
+try:
+    from unstructured.partition.pdf import partition_pdf
+except ImportError as e:
+    logger.info(
+        "unstructured[pdf] not installed, can't use AdvancedPdfLoader, will use PyPdfLoader instead."
+    )
+    raise ImportError from e
+
+
+@dataclass
+class _PageBuffer:
+    page_num: Optional[int]
+    segments: List[str]
+
+
+class AdvancedPdfLoader(LoaderInterface):
+    """
+    PDF loader using unstructured library.
+
+    Extracts text content, images, tables from PDF files page by page, providing
+    structured page information and handling PDF-specific errors.
+    """
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        return ["pdf"]
+
+    @property
+    def supported_mime_types(self) -> List[str]:
+        return ["application/pdf"]
+
+    @property
+    def loader_name(self) -> str:
+        return "advanced_pdf_loader"
+
+    def can_handle(self, extension: str, mime_type: str) -> bool:
+        """Check if file can be handled by this loader."""
+        # Check file extension
+        if extension in self.supported_extensions and mime_type in self.supported_mime_types:
+            return True
+
+        return False
+
+    async def load(self, file_path: str, strategy: str = "auto", **kwargs: Any) -> str:
+        """Load PDF file using unstructured library. If Exception occurs, fallback to PyPDFLoader.
+
+        Args:
+            file_path: Path to the document file
+            strategy: Partitioning strategy ("auto", "fast", "hi_res", "ocr_only")
+            **kwargs: Additional arguments passed to unstructured partition
+
+        Returns:
+            LoaderResult with extracted text content and metadata
+
+        """
+        try:
+            logger.info(f"Processing PDF: {file_path}")
+
+            with open(file_path, "rb") as f:
+                file_metadata = await get_file_metadata(f)
+
+            # Name ingested file of current loader based on original file content hash
+            storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+            # Set partitioning parameters
+            partition_kwargs: Dict[str, Any] = {
+                "filename": file_path,
+                "strategy": strategy,
+                "infer_table_structure": True,
+                "include_page_breaks": False,
+                "include_metadata": True,
+                **kwargs,
+            }
+            # Use partition to extract elements
+            elements = partition_pdf(**partition_kwargs)
+
+            # Process elements into text content
+            page_contents = self._format_elements_by_page(elements)
+
+            # Check if there is any content
+            if not page_contents:
+                logger.warning(
+                    "AdvancedPdfLoader returned no content. Falling back to PyPDF loader."
+                )
+                return await self._fallback(file_path, **kwargs)
+
+            # Combine all page outputs
+            full_content = "\n".join(page_contents)
+
+            # Store the content
+            storage_config = get_storage_config()
+            data_root_directory = storage_config["data_root_directory"]
+            storage = get_file_storage(data_root_directory)
+
+            full_file_path = await storage.store(storage_file_name, full_content)
+
+            return full_file_path
+
+        except Exception as exc:
+            logger.warning("Failed to process PDF with AdvancedPdfLoader: %s", exc)
+            return await self._fallback(file_path, **kwargs)
+
+    async def _fallback(self, file_path: str, **kwargs: Any) -> str:
+        logger.info("Falling back to PyPDF loader for %s", file_path)
+        fallback_loader = PyPdfLoader()
+        return await fallback_loader.load(file_path, **kwargs)
+
+    def _format_elements_by_page(self, elements: List[Any]) -> List[str]:
+        """Format elements by page."""
+        page_buffers: List[_PageBuffer] = []
+        current_buffer = _PageBuffer(page_num=None, segments=[])
+
+        for element in elements:
+            element_dict = self._safe_to_dict(element)
+            metadata = element_dict.get("metadata", {})
+            page_num = metadata.get("page_number")
+
+            if current_buffer.page_num != page_num:
+                if current_buffer.segments:
+                    page_buffers.append(current_buffer)
+                current_buffer = _PageBuffer(page_num=page_num, segments=[])
+
+            formatted = self._format_element(element_dict)
+
+            if formatted:
+                current_buffer.segments.append(formatted)
+
+        if current_buffer.segments:
+            page_buffers.append(current_buffer)
+
+        page_contents: List[str] = []
+        for buffer in page_buffers:
+            header = f"Page {buffer.page_num}:\n" if buffer.page_num is not None else "Page:"
+            content = header + "\n\n".join(buffer.segments) + "\n"
+            page_contents.append(str(content))
+        return page_contents
+
+    def _format_element(
+        self,
+        element: Dict[str, Any],
+    ) -> str:
+        """Format element."""
+        element_type = element.get("type")
+        text = self._clean_text(element.get("text", ""))
+        metadata = element.get("metadata", {})
+
+        if element_type.lower() == "table":
+            return self._format_table_element(element) or text
+
+        if element_type.lower() == "image":
+            description = text or self._format_image_element(metadata)
+            return description
+
+        # Ignore header and footer
+        if element_type.lower() in ["header", "footer"]:
+            pass
+
+        return text
+
+    def _format_table_element(self, element: Dict[str, Any]) -> str:
+        """Format table element."""
+        metadata = element.get("metadata", {})
+        text = self._clean_text(element.get("text", ""))
+        table_html = metadata.get("text_as_html")
+
+        if table_html:
+            return table_html.strip()
+
+        return text
+
+    def _format_image_element(self, metadata: Dict[str, Any]) -> str:
+        """Format image."""
+        placeholder = "[Image omitted]"
+        image_text = placeholder
+        coordinates = metadata.get("coordinates", {})
+        points = coordinates.get("points") if isinstance(coordinates, dict) else None
+        if points and isinstance(points, tuple) and len(points) == 4:
+            leftup = points[0]
+            rightdown = points[3]
+            if (
+                isinstance(leftup, tuple)
+                and isinstance(rightdown, tuple)
+                and len(leftup) == 2
+                and len(rightdown) == 2
+            ):
+                image_text = f"{placeholder} (bbox=({leftup[0]}, {leftup[1]}, {rightdown[0]}, {rightdown[1]}))"
+
+        layout_width = coordinates.get("layout_width")
+        layout_height = coordinates.get("layout_height")
+        system = coordinates.get("system")
+        if layout_width and layout_height and system:
+            image_text = (
+                image_text
+                + f", system={system}, layout_width={layout_width}, layout_height={layout_height}))"
+            )
+
+        return image_text
+
+    def _safe_to_dict(self, element: Any) -> Dict[str, Any]:
+        """Safe to dict."""
+        try:
+            if hasattr(element, "to_dict"):
+                return element.to_dict()
+        except Exception:
+            pass
+        fallback_type = getattr(element, "category", None)
+        if not fallback_type:
+            fallback_type = getattr(element, "__class__", type("", (), {})).__name__
+
+        return {
+            "type": fallback_type,
+            "text": getattr(element, "text", ""),
+            "metadata": getattr(element, "metadata", {}),
+        }
+
+    def _clean_text(self, value: Any) -> str:
+        if value is None:
+            return ""
+        return str(value).replace("\xa0", " ").strip()
+
+
+if __name__ == "__main__":
+    loader = AdvancedPdfLoader()
+    asyncio.run(
+        loader.load(
+            "/Users/xiaotao/work/cognee/cognee/infrastructure/loaders/external/attention_is_all_you_need.pdf"
+        )
+    )
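The loader implements the same `LoaderInterface` contract as the other loaders, so it can be driven directly. A minimal usage sketch, assuming cognee 0.3.5 and the `unstructured[pdf]` extra are installed; `example.pdf` is a hypothetical input path:

```python
import asyncio

from cognee.infrastructure.loaders.external.advanced_pdf_loader import AdvancedPdfLoader


async def main() -> None:
    loader = AdvancedPdfLoader()
    # can_handle requires both a PDF extension and a PDF MIME type.
    assert loader.can_handle("pdf", "application/pdf")
    # load() returns the storage path of the extracted text file; on any
    # exception or empty extraction it falls back to PyPdfLoader.
    stored_path = await loader.load("example.pdf", strategy="fast")
    print(stored_path)


asyncio.run(main())
```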
cognee/infrastructure/loaders/supported_loaders.py
@@ -16,3 +16,10 @@ try:
     supported_loaders[UnstructuredLoader.loader_name] = UnstructuredLoader
 except ImportError:
     pass
+
+try:
+    from cognee.infrastructure.loaders.external import AdvancedPdfLoader
+
+    supported_loaders[AdvancedPdfLoader.loader_name] = AdvancedPdfLoader
+except ImportError:
+    pass
cognee/modules/data/methods/create_authorized_dataset.py
@@ -6,6 +6,15 @@ from .create_dataset import create_dataset
 
 
 async def create_authorized_dataset(dataset_name: str, user: User) -> Dataset:
+    """
+    Create a new dataset and give all permissions on this dataset to the given user.
+    Args:
+        dataset_name: Name of the dataset.
+        user: The user object.
+
+    Returns:
+        Dataset: The new authorized dataset.
+    """
     db_engine = get_relational_engine()
 
     async with db_engine.get_async_session() as session:
cognee/modules/data/methods/get_authorized_dataset.py
@@ -15,7 +15,7 @@ async def get_authorized_dataset(
     Get a specific dataset with permissions for a user.
 
     Args:
-        […]
+        user: User object
         dataset_id (UUID): dataset id
         permission_type (str): permission type(read, write, delete, share), default is read
 
cognee/modules/data/methods/get_authorized_dataset_by_name.py
@@ -11,6 +11,17 @@ from ..models import Dataset
 async def get_authorized_dataset_by_name(
     dataset_name: str, user: User, permission_type: str
 ) -> Optional[Dataset]:
+    """
+    Get a specific dataset with the given name, with permissions for a given user.
+
+    Args:
+        dataset_name: Name of the dataset.
+        user: User object.
+        permission_type (str): permission type(read, write, delete, share), default is read
+
+    Returns:
+        Optional[Dataset]: dataset with permissions
+    """
     authorized_datasets = await get_authorized_existing_datasets([], permission_type, user)
 
     return next((dataset for dataset in authorized_datasets if dataset.name == dataset_name), None)
cognee/modules/data/methods/get_deletion_counts.py (new file)
@@ -0,0 +1,92 @@
+from uuid import UUID
+from cognee.cli.exceptions import CliCommandException
+from cognee.infrastructure.databases.exceptions.exceptions import EntityNotFoundError
+from sqlalchemy import select
+from sqlalchemy.sql import func
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.data.models import Dataset, Data, DatasetData
+from cognee.modules.users.models import User
+from cognee.modules.users.methods import get_user
+from dataclasses import dataclass
+
+
+@dataclass
+class DeletionCountsPreview:
+    datasets: int = 0
+    data_entries: int = 0
+    users: int = 0
+
+
+async def get_deletion_counts(
+    dataset_name: str = None, user_id: str = None, all_data: bool = False
+) -> DeletionCountsPreview:
+    """
+    Calculates the number of items that will be deleted based on the provided arguments.
+    """
+    counts = DeletionCountsPreview()
+    relational_engine = get_relational_engine()
+    async with relational_engine.get_async_session() as session:
+        if dataset_name:
+            # Find the dataset by name
+            dataset_result = await session.execute(
+                select(Dataset).where(Dataset.name == dataset_name)
+            )
+            dataset = dataset_result.scalar_one_or_none()
+
+            if dataset is None:
+                raise CliCommandException(
+                    f"No Dataset exists with the name {dataset_name}", error_code=1
+                )
+
+            # Count data entries linked to this dataset
+            count_query = (
+                select(func.count())
+                .select_from(DatasetData)
+                .where(DatasetData.dataset_id == dataset.id)
+            )
+            data_entry_count = (await session.execute(count_query)).scalar_one()
+            counts.users = 1
+            counts.datasets = 1
+            counts.entries = data_entry_count
+            return counts
+
+        elif all_data:
+            # Simplified logic: Get total counts directly from the tables.
+            counts.datasets = (
+                await session.execute(select(func.count()).select_from(Dataset))
+            ).scalar_one()
+            counts.entries = (
+                await session.execute(select(func.count()).select_from(Data))
+            ).scalar_one()
+            counts.users = (
+                await session.execute(select(func.count()).select_from(User))
+            ).scalar_one()
+            return counts
+
+        # Placeholder for user_id logic
+        elif user_id:
+            user = None
+            try:
+                user_uuid = UUID(user_id)
+                user = await get_user(user_uuid)
+            except (ValueError, EntityNotFoundError):
+                raise CliCommandException(f"No User exists with ID {user_id}", error_code=1)
+            counts.users = 1
+            # Find all datasets owned by this user
+            datasets_query = select(Dataset).where(Dataset.owner_id == user.id)
+            user_datasets = (await session.execute(datasets_query)).scalars().all()
+            dataset_count = len(user_datasets)
+            counts.datasets = dataset_count
+            if dataset_count > 0:
+                dataset_ids = [d.id for d in user_datasets]
+                # Count all data entries across all of the user's datasets
+                data_count_query = (
+                    select(func.count())
+                    .select_from(DatasetData)
+                    .where(DatasetData.dataset_id.in_(dataset_ids))
+                )
+                data_entry_count = (await session.execute(data_count_query)).scalar_one()
+                counts.entries = data_entry_count
+            else:
+                counts.entries = 0
+    return counts
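A hedged sketch of how a caller, such as the updated `delete` CLI command, might use this preview before a destructive deletion; the prompt text here is illustrative, not the actual CLI output:

```python
import asyncio

from cognee.modules.data.methods.get_deletion_counts import get_deletion_counts


async def preview_deletion(dataset_name: str) -> None:
    counts = await get_deletion_counts(dataset_name=dataset_name)
    # counts.entries is assigned dynamically by get_deletion_counts
    # (the declared dataclass field is data_entries).
    print(f"This will delete {counts.datasets} dataset(s) and {counts.entries} data entries.")


asyncio.run(preview_deletion("my_dataset"))  # "my_dataset" is a hypothetical name
```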
cognee/modules/graph/cognee_graph/CogneeGraph.py
@@ -161,7 +161,7 @@ class CogneeGraph(CogneeAbstractGraph):
         edge_distances = await vector_engine.search(
             collection_name="EdgeType_relationship_name",
             query_vector=query_vector,
-            limit=[…]
+            limit=None,
         )
         projection_time = time.time() - start_time
         logger.info(
cognee/modules/graph/utils/expand_with_nodes_and_edges.py
@@ -7,8 +7,14 @@ from cognee.modules.engine.utils import (
     generate_node_id,
     generate_node_name,
 )
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
+from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
 from cognee.shared.data_models import KnowledgeGraph
-from cognee.modules.ontology.rdf_xml.[…]
+from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
+from cognee.modules.ontology.get_default_ontology_resolver import (
+    get_default_ontology_resolver,
+    get_ontology_resolver_from_env,
+)
 
 
 def _create_node_key(node_id: str, category: str) -> str:
@@ -83,7 +89,7 @@ def _process_ontology_edges(
 
 def _create_type_node(
     node_type: str,
-    ontology_resolver: […]
+    ontology_resolver: RDFLibOntologyResolver,
     added_nodes_map: dict,
     added_ontology_nodes_map: dict,
     name_mapping: dict,
@@ -141,7 +147,7 @@ def _create_entity_node(
     node_name: str,
     node_description: str,
     type_node: EntityType,
-    ontology_resolver: […]
+    ontology_resolver: RDFLibOntologyResolver,
     added_nodes_map: dict,
     added_ontology_nodes_map: dict,
     name_mapping: dict,
@@ -198,7 +204,7 @@ def _create_entity_node(
 def _process_graph_nodes(
     data_chunk: DocumentChunk,
     graph: KnowledgeGraph,
-    ontology_resolver: […]
+    ontology_resolver: RDFLibOntologyResolver,
     added_nodes_map: dict,
     added_ontology_nodes_map: dict,
     name_mapping: dict,
@@ -277,7 +283,7 @@ def _process_graph_edges(
 def expand_with_nodes_and_edges(
     data_chunks: list[DocumentChunk],
     chunk_graphs: list[KnowledgeGraph],
-    ontology_resolver: […]
+    ontology_resolver: BaseOntologyResolver = None,
     existing_edges_map: Optional[dict[str, bool]] = None,
 ):
     """
@@ -296,8 +302,8 @@ def expand_with_nodes_and_edges(
         chunk_graphs (list[KnowledgeGraph]): List of knowledge graphs corresponding to each
             data chunk. Each graph contains nodes (entities) and edges (relationships) extracted
             from the chunk content.
-        ontology_resolver ([…]
-            types against an ontology. If None, a default […]
+        ontology_resolver (BaseOntologyResolver, optional): Resolver for validating entities and
+            types against an ontology. If None, a default RDFLibOntologyResolver is created.
             Defaults to None.
         existing_edges_map (dict[str, bool], optional): Mapping of existing edge keys to prevent
             duplicate edge creation. Keys are formatted as "{source_id}_{target_id}_{relation}".
@@ -320,7 +326,15 @@ def expand_with_nodes_and_edges(
         existing_edges_map = {}
 
     if ontology_resolver is None:
-        […]
+        ontology_config = get_ontology_env_config()
+        if (
+            ontology_config.ontology_file_path
+            and ontology_config.ontology_resolver
+            and ontology_config.matching_strategy
+        ):
+            ontology_resolver = get_ontology_resolver_from_env(**ontology_config.to_dict())
+        else:
+            ontology_resolver = get_default_ontology_resolver()
 
     added_nodes_map = {}
     added_ontology_nodes_map = {}
cognee/modules/graph/utils/retrieve_existing_edges.py
@@ -23,8 +23,6 @@ async def retrieve_existing_edges(
         chunk_graphs (list[KnowledgeGraph]): List of knowledge graphs corresponding to each
             data chunk. Each graph contains nodes (entities) and edges (relationships) that
             were extracted from the chunk content.
-        graph_engine (GraphDBInterface): Interface to the graph database that will be queried
-            to check for existing edges. Must implement the has_edges() method.
 
     Returns:
         dict[str, bool]: A mapping of edge keys to boolean values indicating existence.
cognee/modules/observability/get_observe.py
@@ -9,3 +9,17 @@ def get_observe():
         from langfuse.decorators import observe
 
         return observe
+    elif monitoring == Observer.NONE:
+        # Return a no-op decorator that handles keyword arguments
+        def no_op_decorator(*args, **kwargs):
+            if len(args) == 1 and callable(args[0]) and not kwargs:
+                # Direct decoration: @observe
+                return args[0]
+            else:
+                # Parameterized decoration: @observe(as_type="generation")
+                def decorator(func):
+                    return func
+
+                return decorator
+
+        return no_op_decorator
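The no-op branch mirrors how langfuse's `observe` can be applied either bare or with keyword arguments. A self-contained sketch of the same dual-form decorator logic, runnable without cognee or langfuse:

```python
def no_op_decorator(*args, **kwargs):
    # Direct decoration: @no_op_decorator
    if len(args) == 1 and callable(args[0]) and not kwargs:
        return args[0]

    # Parameterized decoration: @no_op_decorator(as_type="generation")
    def decorator(func):
        return func

    return decorator


@no_op_decorator
def plain() -> str:
    return "plain"


@no_op_decorator(as_type="generation")
def parameterized() -> str:
    return "parameterized"


# Both forms leave the wrapped function untouched.
assert plain() == "plain"
assert parameterized() == "parameterized"
```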
cognee/modules/ontology/base_ontology_resolver.py (new file)
@@ -0,0 +1,42 @@
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Optional
+
+from cognee.modules.ontology.models import AttachedOntologyNode
+from cognee.modules.ontology.matching_strategies import MatchingStrategy, FuzzyMatchingStrategy
+
+
+class BaseOntologyResolver(ABC):
+    """Abstract base class for ontology resolvers."""
+
+    def __init__(self, matching_strategy: Optional[MatchingStrategy] = None):
+        """Initialize the ontology resolver with a matching strategy.
+
+        Args:
+            matching_strategy: The strategy to use for entity matching.
+                Defaults to FuzzyMatchingStrategy if None.
+        """
+        self.matching_strategy = matching_strategy or FuzzyMatchingStrategy()
+
+    @abstractmethod
+    def build_lookup(self) -> None:
+        """Build the lookup dictionary for ontology entities."""
+        pass
+
+    @abstractmethod
+    def refresh_lookup(self) -> None:
+        """Refresh the lookup dictionary."""
+        pass
+
+    @abstractmethod
+    def find_closest_match(self, name: str, category: str) -> Optional[str]:
+        """Find the closest match for a given name in the specified category."""
+        pass
+
+    @abstractmethod
+    def get_subgraph(
+        self, node_name: str, node_type: str = "individuals", directed: bool = True
+    ) -> Tuple[
+        List[AttachedOntologyNode], List[Tuple[str, str, str]], Optional[AttachedOntologyNode]
+    ]:
+        """Get a subgraph for the given node."""
+        pass
cognee/modules/ontology/get_default_ontology_resolver.py (new file)
@@ -0,0 +1,41 @@
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
+from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
+from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy
+
+
+def get_default_ontology_resolver() -> BaseOntologyResolver:
+    return RDFLibOntologyResolver(ontology_file=None, matching_strategy=FuzzyMatchingStrategy())
+
+
+def get_ontology_resolver_from_env(
+    ontology_resolver: str = "", matching_strategy: str = "", ontology_file_path: str = ""
+) -> BaseOntologyResolver:
+    """
+    Create and return an ontology resolver instance based on environment parameters.
+
+    Currently, this function supports only the RDFLib-based ontology resolver
+    with a fuzzy matching strategy.
+
+    Args:
+        ontology_resolver (str): The ontology resolver type to use.
+            Supported value: "rdflib".
+        matching_strategy (str): The matching strategy to apply.
+            Supported value: "fuzzy".
+        ontology_file_path (str): Path to the ontology file required for the resolver.
+
+    Returns:
+        BaseOntologyResolver: An instance of the requested ontology resolver.
+
+    Raises:
+        EnvironmentError: If the provided resolver or strategy is unsupported,
+            or if required parameters are missing.
+    """
+    if ontology_resolver == "rdflib" and matching_strategy == "fuzzy" and ontology_file_path:
+        return RDFLibOntologyResolver(
+            matching_strategy=FuzzyMatchingStrategy(), ontology_file=ontology_file_path
+        )
+    else:
+        raise EnvironmentError(
+            f"Unsupported ontology resolver: {ontology_resolver}. "
+            f"Supported resolvers are: RdfLib with FuzzyMatchingStrategy."
+        )
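A minimal sketch of both construction paths exposed by this module, assuming cognee 0.3.5 is installed; the ontology file path is hypothetical:

```python
from cognee.modules.ontology.get_default_ontology_resolver import (
    get_default_ontology_resolver,
    get_ontology_resolver_from_env,
)

# Default path: RDFLib resolver with fuzzy matching and no ontology file.
default_resolver = get_default_ontology_resolver()

# Env-driven path: only the "rdflib" + "fuzzy" combination is accepted;
# any other combination raises EnvironmentError.
env_resolver = get_ontology_resolver_from_env(
    ontology_resolver="rdflib",
    matching_strategy="fuzzy",
    ontology_file_path="/path/to/ontology.owl",  # hypothetical path
)
```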
cognee/modules/ontology/matching_strategies.py (new file)
@@ -0,0 +1,53 @@
+import difflib
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+
+class MatchingStrategy(ABC):
+    """Abstract base class for ontology entity matching strategies."""
+
+    @abstractmethod
+    def find_match(self, name: str, candidates: List[str]) -> Optional[str]:
+        """Find the best match for a given name from a list of candidates.
+
+        Args:
+            name: The name to match
+            candidates: List of candidate names to match against
+
+        Returns:
+            The best matching candidate name, or None if no match found
+        """
+        pass
+
+
+class FuzzyMatchingStrategy(MatchingStrategy):
+    """Fuzzy matching strategy using difflib for approximate string matching."""
+
+    def __init__(self, cutoff: float = 0.8):
+        """Initialize fuzzy matching strategy.
+
+        Args:
+            cutoff: Minimum similarity score (0.0 to 1.0) for a match to be considered valid
+        """
+        self.cutoff = cutoff
+
+    def find_match(self, name: str, candidates: List[str]) -> Optional[str]:
+        """Find the closest fuzzy match for a given name.
+
+        Args:
+            name: The normalized name to match
+            candidates: List of normalized candidate names
+
+        Returns:
+            The best matching candidate name, or None if no match meets the cutoff
+        """
+        if not candidates:
+            return None
+
+        # Check for exact match first
+        if name in candidates:
+            return name
+
+        # Find fuzzy match
+        best_match = difflib.get_close_matches(name, candidates, n=1, cutoff=self.cutoff)
+        return best_match[0] if best_match else None
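A short usage sketch of the new strategy; the candidate names are illustrative. With cognee installed this runs as-is, since `find_match` depends only on difflib:

```python
from cognee.modules.ontology.matching_strategies import FuzzyMatchingStrategy

strategy = FuzzyMatchingStrategy(cutoff=0.8)
candidates = ["quantum_computer", "classical_computer", "graph_database"]

# Exact matches are returned immediately.
assert strategy.find_match("graph_database", candidates) == "graph_database"

# Near matches resolve via difflib.get_close_matches
# ("quantum_computre" vs "quantum_computer" scores about 0.94).
assert strategy.find_match("quantum_computre", candidates) == "quantum_computer"

# Names below the cutoff yield no match.
assert strategy.find_match("spaceship", candidates) is None
```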