aiecs 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,398 @@
1
+ """
2
+ Text Similarity Utilities
3
+
4
+ Provides various text similarity and matching functions for knowledge graph operations.
5
+ Includes BM25, Jaccard, cosine similarity, Levenshtein distance, and fuzzy matching.
6
+ """
7
+
8
+ import re
9
+ import math
10
+ from typing import List, Optional, Tuple
11
+ from collections import Counter
12
+ from difflib import SequenceMatcher
13
+
14
+
15
class BM25Scorer:
    """
    BM25 (Best Matching 25) scorer for text similarity

    BM25 is a ranking function used to estimate the relevance of documents
    to a given search query. The corpus is tokenized and indexed once at
    construction time; every call to ``score`` reuses those statistics.

    Example:
        ```python
        scorer = BM25Scorer(corpus=[
            "The quick brown fox jumps over the lazy dog",
            "A quick brown dog jumps over a lazy fox",
            "The lazy dog sleeps all day"
        ])

        scores = scorer.score("quick brown fox")
        # Returns scores for each document in corpus
        ```
    """

    def __init__(
        self,
        corpus: List[str],
        k1: float = 1.5,
        b: float = 0.75,
        tokenizer: Optional[callable] = None,
    ):
        """
        Initialize BM25 scorer

        Args:
            corpus: List of documents to score against
            k1: Term frequency saturation parameter (default: 1.5)
            b: Length normalization parameter (default: 0.75)
            tokenizer: Optional callable mapping a string to its tokens
                (default: lowercase the text and extract ``\\w+`` runs)
        """
        self.k1 = k1
        self.b = b
        self.tokenizer = tokenizer or self._default_tokenizer

        # Tokenize corpus
        self.documents = [self.tokenizer(doc) for doc in corpus]
        self.doc_count = len(self.documents)

        # Document lengths (in tokens) and their mean; the mean is 0 for an
        # empty corpus so construction never divides by zero.
        self.doc_lengths = [len(doc) for doc in self.documents]
        self.avg_doc_length = sum(self.doc_lengths) / self.doc_count if self.doc_count > 0 else 0

        # Per-document term frequencies, plus the number of documents each
        # term appears in (document frequency).
        self.term_freqs = []
        self.doc_freqs = Counter()
        for doc in self.documents:
            tf = Counter(doc)
            self.term_freqs.append(tf)
            # The keys of ``tf`` are the distinct terms of this document, so
            # each term is counted once per document.
            self.doc_freqs.update(tf.keys())

        # IDF (Inverse Document Frequency). The "+ 1.0" inside the log keeps
        # the weight positive even for terms present in most documents.
        self.idf = {
            term: math.log((self.doc_count - df + 0.5) / (df + 0.5) + 1.0)
            for term, df in self.doc_freqs.items()
        }

    def _default_tokenizer(self, text: str) -> List[str]:
        """Default tokenizer: lowercase the text and extract ``\\w+`` runs"""
        return re.findall(r"\w+", text.lower())

    def score(self, query: str) -> List[float]:
        """
        Score documents against query

        Args:
            query: Query string

        Returns:
            List of BM25 scores, one per corpus document, in corpus order.
            Documents sharing no term with the query score 0.0.
        """
        # Materialize the tokens: the same query terms are replayed for every
        # document, which a one-shot iterator could not support.
        query_terms = list(self.tokenizer(query))
        scores = []

        for doc_length, term_freq in zip(self.doc_lengths, self.term_freqs):
            if not term_freq:
                # A document without tokens cannot match anything. Skipping it
                # also avoids dividing by avg_doc_length when that is 0.
                scores.append(0.0)
                continue

            # The length normalisation depends only on the document, so it is
            # computed once instead of once per query term.
            length_norm = self.k1 * (
                1 - self.b + self.b * (doc_length / self.avg_doc_length)
            )

            score = 0.0
            for term in query_terms:
                tf = term_freq.get(term, 0)
                if tf:
                    idf = self.idf.get(term, 0.0)
                    # BM25 formula
                    score += idf * tf * (self.k1 + 1) / (tf + length_norm)

            scores.append(score)

        return scores

    def get_top_n(self, query: str, n: int = 10) -> List[Tuple[int, float]]:
        """
        Get top N documents by BM25 score

        Args:
            query: Query string
            n: Number of top results to return; values <= 0 yield no results

        Returns:
            List of (document_index, score) tuples, sorted by score descending.
            Documents with equal scores keep their corpus order.
        """
        if n <= 0:
            # A negative n must not be used as a slice bound: ``[:n]`` would
            # silently return "all but the last |n|" entries.
            return []

        ranked = sorted(
            enumerate(self.score(query)),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return ranked[:n]
131
+
132
+
133
def jaccard_similarity(set1: set, set2: set) -> float:
    """
    Compute the Jaccard index of two sets

    The index is the size of the intersection divided by the size of the
    union: |A ∩ B| / |A ∪ B|.

    Args:
        set1: First set
        set2: Second set

    Returns:
        Jaccard similarity score (0.0 to 1.0); two empty sets are treated as
        identical and score 1.0
    """
    if not (set1 or set2):
        # Nothing on either side: defined as a perfect match.
        return 1.0

    # At least one set is non-empty here, so the union cannot be empty.
    shared = set1 & set2
    combined = set1 | set2
    return len(shared) / len(combined)
156
+
157
+
158
def jaccard_similarity_text(text1: str, text2: str, tokenizer: Optional[callable] = None) -> float:
    """
    Compute the Jaccard similarity of two strings over their token sets

    Args:
        text1: First text string
        text2: Second text string
        tokenizer: Optional callable turning a string into tokens; when
            omitted, the text is lowercased and split into ``\\w+`` runs

    Returns:
        Jaccard similarity score (0.0 to 1.0)
    """
    if tokenizer is None:

        def split_words(text):
            return re.findall(r"\w+", text.lower())

    else:
        split_words = tokenizer

    # Whatever the tokenizer returns is collapsed into a set, so repeated
    # tokens do not influence the score.
    tokens_a = set(split_words(text1))
    tokens_b = set(split_words(text2))

    return jaccard_similarity(tokens_a, tokens_b)
186
+
187
+
188
def cosine_similarity_text(text1: str, text2: str, tokenizer: Optional[callable] = None) -> float:
    """
    Compute the cosine similarity of two strings

    Each text is turned into a vector of raw term counts (one dimension per
    distinct token); the result is the cosine of the angle between the two
    vectors.

    Args:
        text1: First text string
        text2: Second text string
        tokenizer: Optional callable turning a string into tokens; when
            omitted, the text is lowercased and split into ``\\w+`` runs

    Returns:
        Cosine similarity score (0.0 to 1.0). When neither text produces a
        token, the score is 1.0 if both strings are empty and 0.0 otherwise.
    """
    if tokenizer is None:

        def tokenizer(text):
            return re.findall(r"\w+", text.lower())

    words_a = tokenizer(text1)
    words_b = tokenizer(text2)

    # Union of all distinct tokens seen in either text
    vocabulary = set(words_a) | set(words_b)
    if not vocabulary:
        return 1.0 if not (text1 or text2) else 0.0

    # Term-count vectors; a Counter reports 0 for tokens it has not seen.
    counts_a = Counter(words_a)
    counts_b = Counter(words_b)

    norm_a = math.sqrt(sum(count * count for count in counts_a.values()))
    norm_b = math.sqrt(sum(count * count for count in counts_b.values()))
    if norm_a == 0 or norm_b == 0:
        # One side has no tokens at all, so there is no direction to compare.
        return 0.0

    # Only tokens present in both texts contribute to the dot product.
    dot = sum(count * counts_b[term] for term, count in counts_a.items())
    cosine = dot / (norm_a * norm_b)

    # Clamp away floating point drift just outside [0, 1]
    return min(1.0, max(0.0, cosine))
232
+
233
+
234
def levenshtein_distance(s1: str, s2: str) -> int:
    """
    Compute the Levenshtein (edit) distance between two strings

    The distance is the smallest number of single-character insertions,
    deletions, or substitutions that turns one string into the other.

    Args:
        s1: First string
        s2: Second string

    Returns:
        Levenshtein distance (0 = identical, higher = more different)
    """
    # Iterate rows over the longer string so each DP row spans the shorter one.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if len(shorter) == 0:
        return len(longer)

    # prev[k] holds the distance between the processed prefix of `longer`
    # and the first k characters of `shorter`.
    prev = list(range(len(shorter) + 1))

    for row, ch_long in enumerate(longer, start=1):
        curr = [row]
        for col, ch_short in enumerate(shorter, start=1):
            cost = 0 if ch_long == ch_short else 1
            curr.append(
                min(
                    prev[col] + 1,  # insertion
                    curr[col - 1] + 1,  # deletion
                    prev[col - 1] + cost,  # substitution (free on a match)
                )
            )
        prev = curr

    return prev[-1]
268
+
269
+
270
def normalized_levenshtein_similarity(s1: str, s2: str) -> float:
    """
    Turn the Levenshtein distance into a similarity in the range 0.0 to 1.0

    The edit distance is divided by the length of the longer string and
    subtracted from 1.

    Args:
        s1: First string
        s2: Second string

    Returns:
        Normalized similarity score (1.0 = identical, 0.0 = completely different)
    """
    longest = max(len(s1), len(s2))
    if longest == 0:
        # Two empty strings are identical; this also avoids dividing by zero.
        return 1.0

    return 1.0 - (levenshtein_distance(s1, s2) / longest)
287
+
288
+
289
def fuzzy_match(
    query: str,
    candidates: List[str],
    threshold: float = 0.6,
    method: str = "jaccard",
) -> List[Tuple[str, float]]:
    """
    Find fuzzy matches for a query string in a list of candidates

    Args:
        query: Query string to match
        candidates: List of candidate strings
        threshold: Minimum similarity threshold (0.0 to 1.0); candidates
            scoring exactly the threshold are kept
        method: Similarity method ("jaccard", "cosine", "levenshtein", "ratio").
            "ratio" lowercases both strings before comparing, whereas
            "levenshtein" compares the strings exactly as given.

    Returns:
        List of (candidate, similarity_score) tuples above threshold,
        sorted by score descending. Candidates with equal scores keep their
        input order.

    Raises:
        ValueError: If ``method`` is not one of the supported names. The
            method is checked before any candidate is scored, so the error
            is raised even when ``candidates`` is empty.
    """
    # Resolve the scoring function once instead of re-dispatching on the
    # method name for every candidate.
    if method == "jaccard":
        similarity = jaccard_similarity_text
    elif method == "cosine":
        similarity = cosine_similarity_text
    elif method == "levenshtein":
        similarity = normalized_levenshtein_similarity
    elif method == "ratio":
        query_lower = query.lower()

        def similarity(_query, candidate):
            # Use SequenceMatcher ratio (built-in fuzzy matching). The query
            # stays the first sequence: ratio() may depend on argument order.
            return SequenceMatcher(None, query_lower, candidate.lower()).ratio()

    else:
        raise ValueError(
            f"Unknown method: {method}. Use 'jaccard', 'cosine', 'levenshtein', or 'ratio'"
        )

    results = []
    for candidate in candidates:
        score = similarity(query, candidate)
        if score >= threshold:
            results.append((candidate, score))

    # Sort by score descending
    results.sort(key=lambda pair: pair[1], reverse=True)
    return results
331
+
332
+
333
+ class TextSimilarity:
334
+ """
335
+ Convenience class for text similarity operations
336
+
337
+ Provides a unified interface for various text similarity methods.
338
+
339
+ Example:
340
+ ```python
341
+ similarity = TextSimilarity()
342
+
343
+ # Jaccard similarity
344
+ score = similarity.jaccard("hello world", "world hello")
345
+
346
+ # Cosine similarity
347
+ score = similarity.cosine("machine learning", "deep learning")
348
+
349
+ # Levenshtein distance
350
+ distance = similarity.levenshtein("kitten", "sitting")
351
+
352
+ # Fuzzy matching
353
+ matches = similarity.fuzzy_match(
354
+ "python",
355
+ ["python3", "pyton", "java", "pythn"],
356
+ threshold=0.7
357
+ )
358
+ ```
359
+ """
360
+
361
+ def __init__(self, tokenizer: Optional[callable] = None):
362
+ """
363
+ Initialize TextSimilarity
364
+
365
+ Args:
366
+ tokenizer: Optional tokenizer function for text processing
367
+ """
368
+ self.tokenizer = tokenizer
369
+
370
+ def jaccard(self, text1: str, text2: str) -> float:
371
+ """Calculate Jaccard similarity between two texts"""
372
+ return jaccard_similarity_text(text1, text2, self.tokenizer)
373
+
374
+ def cosine(self, text1: str, text2: str) -> float:
375
+ """Calculate cosine similarity between two texts"""
376
+ return cosine_similarity_text(text1, text2, self.tokenizer)
377
+
378
+ def levenshtein(self, text1: str, text2: str) -> int:
379
+ """Calculate Levenshtein distance between two texts"""
380
+ return levenshtein_distance(text1, text2)
381
+
382
+ def levenshtein_similarity(self, text1: str, text2: str) -> float:
383
+ """Calculate normalized Levenshtein similarity"""
384
+ return normalized_levenshtein_similarity(text1, text2)
385
+
386
+ def fuzzy_match(
387
+ self,
388
+ query: str,
389
+ candidates: List[str],
390
+ threshold: float = 0.6,
391
+ method: str = "jaccard",
392
+ ) -> List[Tuple[str, float]]:
393
+ """Find fuzzy matches for a query"""
394
+ return fuzzy_match(query, candidates, threshold, method)
395
+
396
+ def bm25(self, corpus: List[str], k1: float = 1.5, b: float = 0.75) -> BM25Scorer:
397
+ """Create a BM25 scorer for a corpus"""
398
+ return BM25Scorer(corpus, k1=k1, b=b, tokenizer=self.tokenizer)
@@ -0,0 +1,15 @@
1
+ """
2
+ Knowledge Graph Traversal Application Layer
3
+
4
+ Advanced traversal algorithms and path ranking utilities.
5
+ """
6
+
7
+ from aiecs.application.knowledge_graph.traversal.path_scorer import PathScorer
8
+ from aiecs.application.knowledge_graph.traversal.enhanced_traversal import (
9
+ EnhancedTraversal,
10
+ )
11
+
12
+ __all__ = [
13
+ "PathScorer",
14
+ "EnhancedTraversal",
15
+ ]