aiecs 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Text Similarity Utilities
|
|
3
|
+
|
|
4
|
+
Provides various text similarity and matching functions for knowledge graph operations.
|
|
5
|
+
Includes BM25, Jaccard, cosine similarity, Levenshtein distance, and fuzzy matching.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
import math
|
|
10
|
+
from typing import List, Optional, Tuple
|
|
11
|
+
from collections import Counter
|
|
12
|
+
from difflib import SequenceMatcher
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BM25Scorer:
    """
    BM25 (Best Matching 25) scorer for text similarity

    BM25 is a ranking function used to estimate the relevance of documents
    to a given search query. It's an improvement over TF-IDF.

    The corpus is tokenized once at construction time; per-document term
    frequencies, document frequencies and IDF values are precomputed so that
    each call to ``score`` only has to walk the query terms.

    Example:
        ```python
        scorer = BM25Scorer(corpus=[
            "The quick brown fox jumps over the lazy dog",
            "A quick brown dog jumps over a lazy fox",
            "The lazy dog sleeps all day"
        ])

        scores = scorer.score("quick brown fox")
        # Returns scores for each document in corpus
        ```
    """

    def __init__(
        self,
        corpus: List[str],
        k1: float = 1.5,
        b: float = 0.75,
        tokenizer: Optional[callable] = None,
    ):
        """
        Initialize BM25 scorer

        Args:
            corpus: List of documents to score against
            k1: Term frequency saturation parameter (default: 1.5)
            b: Length normalization parameter (default: 0.75)
            tokenizer: Optional tokenizer function mapping a string to an
                iterable of tokens (default: lowercase ``\\w+`` extraction)
        """
        self.k1 = k1
        self.b = b
        self.tokenizer = tokenizer or self._default_tokenizer

        # Materialize every token stream: a tokenizer may return a generator,
        # which supports neither len() nor the repeated iteration done below.
        self.documents = [list(self.tokenizer(doc)) for doc in corpus]
        self.doc_count = len(self.documents)

        # Document lengths are measured in tokens.
        self.doc_lengths = [len(doc) for doc in self.documents]
        self.avg_doc_length = sum(self.doc_lengths) / self.doc_count if self.doc_count > 0 else 0

        # Per-document term frequencies and corpus-wide document frequencies.
        self.term_freqs = []
        self.doc_freqs = Counter()

        for doc in self.documents:
            tf = Counter(doc)
            self.term_freqs.append(tf)
            # Iterating the Counter visits each distinct term once, so a term
            # contributes at most 1 to its document frequency per document.
            for term in tf:
                self.doc_freqs[term] += 1

        # IDF (Inverse Document Frequency). The "+ 1.0" inside the logarithm
        # keeps the value positive even for terms present in every document.
        self.idf = {}
        for term, df in self.doc_freqs.items():
            self.idf[term] = math.log((self.doc_count - df + 0.5) / (df + 0.5) + 1.0)

    def _default_tokenizer(self, text: str) -> List[str]:
        """Default tokenizer: lowercase the text and extract runs of word characters"""
        return re.findall(r"\w+", text.lower())

    def score(self, query: str) -> List[float]:
        """
        Score documents against query

        Args:
            query: Query string

        Returns:
            List of BM25 scores for each document, in corpus order. Query
            terms that do not occur in a document contribute nothing, and a
            term repeated in the query is counted once per repetition.
        """
        # list() so the terms can be re-iterated for every document even when
        # the tokenizer hands back a one-shot iterator.
        query_terms = list(self.tokenizer(query))
        scores = []

        for doc_length, term_freq in zip(self.doc_lengths, self.term_freqs):
            # The length normalization depends only on the document, so it is
            # computed once per document instead of once per query term.
            # avg_doc_length is 0 only when every document is empty.
            if self.avg_doc_length:
                length_ratio = doc_length / self.avg_doc_length
            else:
                length_ratio = 0.0
            saturation = self.k1 * (1 - self.b + self.b * length_ratio)

            score = 0.0
            for term in query_terms:
                if term in term_freq:
                    tf = term_freq[term]
                    idf = self.idf.get(term, 0.0)

                    # BM25 formula
                    numerator = idf * tf * (self.k1 + 1)
                    denominator = tf + saturation
                    score += numerator / denominator

            scores.append(score)

        return scores

    def get_top_n(self, query: str, n: int = 10) -> List[Tuple[int, float]]:
        """
        Get top N documents by BM25 score

        Args:
            query: Query string
            n: Number of top results to return; values <= 0 yield an empty list

        Returns:
            List of (document_index, score) tuples, sorted by score descending.
            Documents with equal scores keep their corpus order.
        """
        # A negative n would otherwise be interpreted by the slice below as
        # "drop the last |n| results", which is never what a caller means.
        if n <= 0:
            return []

        indexed_scores = list(enumerate(self.score(query)))
        indexed_scores.sort(key=lambda pair: pair[1], reverse=True)
        return indexed_scores[:n]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def jaccard_similarity(set1: set, set2: set) -> float:
    """
    Compute the Jaccard index of two sets

    Jaccard similarity = |A ∩ B| / |A ∪ B|

    Args:
        set1: First set
        set2: Second set

    Returns:
        Jaccard similarity score (0.0 to 1.0). Two empty sets are treated as
        identical and score 1.0.
    """
    combined = set1 | set2

    # The union is empty exactly when both inputs are empty.
    if not combined:
        return 1.0

    shared = set1 & set2
    return len(shared) / len(combined)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def jaccard_similarity_text(text1: str, text2: str, tokenizer: Optional[callable] = None) -> float:
    """
    Compute the Jaccard similarity of the token sets of two strings

    Args:
        text1: First text string
        text2: Second text string
        tokenizer: Optional tokenizer function; its result is converted to a
            set (default: lowercase the text and extract ``\\w+`` runs)

    Returns:
        Jaccard similarity score (0.0 to 1.0)
    """
    if tokenizer is not None:
        split = tokenizer
    else:

        def split(text):
            return re.findall(r"\w+", text.lower())

    # Duplicates are irrelevant for Jaccard, so each token stream collapses
    # to a set before the comparison.
    first_tokens = set(split(text1))
    second_tokens = set(split(text2))

    return jaccard_similarity(first_tokens, second_tokens)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def cosine_similarity_text(text1: str, text2: str, tokenizer: Optional[callable] = None) -> float:
    """
    Calculate cosine similarity between two text strings

    Cosine similarity measures the cosine of the angle between two vectors
    in a multi-dimensional space. Here each text is represented by its raw
    term-frequency (token count) vector; no IDF weighting is applied.

    Args:
        text1: First text string
        text2: Second text string
        tokenizer: Optional tokenizer function returning an iterable of
            tokens (default: lowercase the text and extract ``\\w+`` runs)

    Returns:
        Cosine similarity score (0.0 to 1.0). If neither text produces a
        token, the result is 1.0 when both strings are empty and 0.0
        otherwise; if only one text produces tokens the result is 0.0.
    """
    if tokenizer is None:

        def tokenizer(t):
            return re.findall(r"\w+", t.lower())

    # Count straight from the tokenizer output. Each token stream is consumed
    # exactly once, so tokenizers that return a one-shot iterator/generator
    # work as well as those returning a list.
    tf1 = Counter(tokenizer(text1))
    tf2 = Counter(tokenizer(text2))

    # Empty vocabulary: neither text produced a token.
    if not tf1 and not tf2:
        return 1.0 if not text1 and not text2 else 0.0

    # Only terms present in both texts contribute to the dot product.
    dot_product = sum(tf1[term] * tf2[term] for term in tf1.keys() & tf2.keys())
    magnitude1 = math.sqrt(sum(count ** 2 for count in tf1.values()))
    magnitude2 = math.sqrt(sum(count ** 2 for count in tf2.values()))

    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    similarity = dot_product / (magnitude1 * magnitude2)
    # Handle floating point precision issues
    return min(1.0, max(0.0, similarity))
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def levenshtein_distance(s1: str, s2: str) -> int:
    """
    Compute the edit distance between two strings

    The Levenshtein distance is the smallest number of single-character
    insertions, deletions or substitutions that turns one string into the
    other.

    Args:
        s1: First string
        s2: Second string

    Returns:
        Levenshtein distance (0 = identical, higher = more different)
    """
    # Put the longer string on the outer loop so each DP row has the length
    # of the shorter one.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if not shorter:
        return len(longer)

    # Row-by-row dynamic programming; only the previous row is kept.
    prev_row = list(range(len(shorter) + 1))

    for row_no, ch_long in enumerate(longer, start=1):
        row = [row_no]
        for col, ch_short in enumerate(shorter):
            mismatch = 0 if ch_long == ch_short else 1
            row.append(
                min(
                    prev_row[col + 1] + 1,
                    row[col] + 1,
                    prev_row[col] + mismatch,
                )
            )
        prev_row = row

    return prev_row[-1]
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def normalized_levenshtein_similarity(s1: str, s2: str) -> float:
    """
    Turn the Levenshtein distance into a similarity in the range 0.0 to 1.0

    The distance is divided by the length of the longer string and the
    result is subtracted from 1.0.

    Args:
        s1: First string
        s2: Second string

    Returns:
        Normalized similarity score (1.0 = identical, 0.0 = completely different)
    """
    longest = max(len(s1), len(s2))

    # Two empty strings are identical; this also avoids dividing by zero.
    if longest == 0:
        return 1.0

    return 1.0 - (levenshtein_distance(s1, s2) / longest)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def fuzzy_match(
    query: str,
    candidates: List[str],
    threshold: float = 0.6,
    method: str = "jaccard",
) -> List[Tuple[str, float]]:
    """
    Find fuzzy matches for a query string in a list of candidates

    Args:
        query: Query string to match
        candidates: List of candidate strings
        threshold: Minimum similarity threshold (0.0 to 1.0); candidates
            scoring exactly the threshold are included
        method: Similarity method ("jaccard", "cosine", "levenshtein", "ratio").
            "ratio" lowercases both strings before comparing them, while
            "levenshtein" compares the strings exactly as given.

    Returns:
        List of (candidate, similarity_score) tuples above threshold,
        sorted by score descending. Candidates with equal scores keep the
        order they had in ``candidates``.

    Raises:
        ValueError: If ``method`` is not one of the supported names. The
            method is validated before any candidate is examined, so the
            error is raised even when ``candidates`` is empty.
    """
    # Resolve the scoring function once, up front, instead of re-checking the
    # method name for every candidate.
    if method == "jaccard":
        similarity = jaccard_similarity_text
    elif method == "cosine":
        similarity = cosine_similarity_text
    elif method == "levenshtein":
        similarity = normalized_levenshtein_similarity
    elif method == "ratio":
        # Use SequenceMatcher ratio (built-in fuzzy matching). The query is
        # lowercased once rather than once per candidate.
        query_lower = query.lower()

        def similarity(_query, candidate):
            return SequenceMatcher(None, query_lower, candidate.lower()).ratio()

    else:
        raise ValueError(
            f"Unknown method: {method}. Use 'jaccard', 'cosine', 'levenshtein', or 'ratio'"
        )

    results = []
    for candidate in candidates:
        score = similarity(query, candidate)
        if score >= threshold:
            results.append((candidate, score))

    # Sort by score descending
    results.sort(key=lambda pair: pair[1], reverse=True)
    return results
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
class TextSimilarity:
    """
    Facade bundling the module's text similarity helpers

    Holds an optional tokenizer and forwards it to the token-based measures
    (Jaccard, cosine, BM25); the character-based measures ignore it.

    Example:
        ```python
        similarity = TextSimilarity()

        # Jaccard similarity
        score = similarity.jaccard("hello world", "world hello")

        # Cosine similarity
        score = similarity.cosine("machine learning", "deep learning")

        # Levenshtein distance
        distance = similarity.levenshtein("kitten", "sitting")

        # Fuzzy matching
        matches = similarity.fuzzy_match(
            "python",
            ["python3", "pyton", "java", "pythn"],
            threshold=0.7
        )
        ```
    """

    def __init__(self, tokenizer: Optional[callable] = None):
        """
        Create the facade

        Args:
            tokenizer: Optional tokenizer function passed on to the
                token-based measures; None selects their default tokenizer
        """
        self.tokenizer = tokenizer

    def jaccard(self, text1: str, text2: str) -> float:
        """Jaccard similarity of the two texts, using the stored tokenizer"""
        return jaccard_similarity_text(text1, text2, tokenizer=self.tokenizer)

    def cosine(self, text1: str, text2: str) -> float:
        """Cosine similarity of the two texts, using the stored tokenizer"""
        return cosine_similarity_text(text1, text2, tokenizer=self.tokenizer)

    def levenshtein(self, text1: str, text2: str) -> int:
        """Levenshtein (edit) distance between the two texts"""
        return levenshtein_distance(text1, text2)

    def levenshtein_similarity(self, text1: str, text2: str) -> float:
        """Levenshtein distance normalized to a similarity score"""
        return normalized_levenshtein_similarity(text1, text2)

    def fuzzy_match(
        self,
        query: str,
        candidates: List[str],
        threshold: float = 0.6,
        method: str = "jaccard",
    ) -> List[Tuple[str, float]]:
        """Return the candidates whose similarity to the query meets the threshold"""
        # The stored tokenizer is not forwarded here; the module-level
        # function always uses each method's default tokenization.
        return fuzzy_match(
            query=query,
            candidates=candidates,
            threshold=threshold,
            method=method,
        )

    def bm25(self, corpus: List[str], k1: float = 1.5, b: float = 0.75) -> BM25Scorer:
        """Build a BM25 scorer over the corpus, using the stored tokenizer"""
        return BM25Scorer(corpus=corpus, k1=k1, b=b, tokenizer=self.tokenizer)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Knowledge Graph Traversal Application Layer
|
|
3
|
+
|
|
4
|
+
Advanced traversal algorithms and path ranking utilities.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from aiecs.application.knowledge_graph.traversal.path_scorer import PathScorer
|
|
8
|
+
from aiecs.application.knowledge_graph.traversal.enhanced_traversal import (
|
|
9
|
+
EnhancedTraversal,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"PathScorer",
|
|
14
|
+
"EnhancedTraversal",
|
|
15
|
+
]
|