cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223)
  1. cognee/api/client.py +44 -4
  2. cognee/api/health.py +332 -0
  3. cognee/api/v1/add/add.py +5 -2
  4. cognee/api/v1/add/routers/get_add_router.py +3 -0
  5. cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
  6. cognee/api/v1/cognify/cognify.py +8 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
  8. cognee/api/v1/config/config.py +3 -1
  9. cognee/api/v1/datasets/routers/get_datasets_router.py +2 -8
  10. cognee/api/v1/delete/delete.py +16 -12
  11. cognee/api/v1/responses/routers/get_responses_router.py +3 -1
  12. cognee/api/v1/search/search.py +10 -0
  13. cognee/api/v1/settings/routers/get_settings_router.py +0 -2
  14. cognee/base_config.py +1 -0
  15. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
  16. cognee/infrastructure/databases/graph/config.py +2 -0
  17. cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
  18. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
  19. cognee/infrastructure/databases/graph/kuzu/adapter.py +43 -16
  20. cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
  21. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +151 -77
  22. cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
  23. cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
  24. cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
  25. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
  26. cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
  27. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
  28. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +11 -3
  29. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
  30. cognee/infrastructure/databases/vector/create_vector_engine.py +31 -23
  31. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
  32. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
  33. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
  34. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
  35. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
  36. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
  37. cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
  38. cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
  39. cognee/infrastructure/files/utils/guess_file_type.py +2 -2
  40. cognee/infrastructure/files/utils/open_data_file.py +4 -23
  41. cognee/infrastructure/llm/LLMGateway.py +137 -0
  42. cognee/infrastructure/llm/__init__.py +14 -4
  43. cognee/infrastructure/llm/config.py +29 -1
  44. cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
  45. cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
  46. cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
  47. cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
  48. cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
  49. cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
  50. cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
  51. cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
  52. cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
  53. cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
  54. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
  55. cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
  65. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
  66. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
  67. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
  68. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
  69. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
  70. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
  71. cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
  72. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
  73. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
  74. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
  75. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
  76. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
  77. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
  78. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
  79. cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
  80. cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
  81. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
  82. cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
  83. cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
  84. cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
  85. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
  86. cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
  87. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
  88. cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
  89. cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
  90. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
  91. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
  92. cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
  93. cognee/infrastructure/llm/utils.py +3 -1
  94. cognee/infrastructure/loaders/LoaderEngine.py +156 -0
  95. cognee/infrastructure/loaders/LoaderInterface.py +73 -0
  96. cognee/infrastructure/loaders/__init__.py +18 -0
  97. cognee/infrastructure/loaders/core/__init__.py +7 -0
  98. cognee/infrastructure/loaders/core/audio_loader.py +98 -0
  99. cognee/infrastructure/loaders/core/image_loader.py +114 -0
  100. cognee/infrastructure/loaders/core/text_loader.py +90 -0
  101. cognee/infrastructure/loaders/create_loader_engine.py +32 -0
  102. cognee/infrastructure/loaders/external/__init__.py +22 -0
  103. cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
  104. cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
  105. cognee/infrastructure/loaders/get_loader_engine.py +18 -0
  106. cognee/infrastructure/loaders/supported_loaders.py +18 -0
  107. cognee/infrastructure/loaders/use_loader.py +21 -0
  108. cognee/infrastructure/loaders/utils/__init__.py +0 -0
  109. cognee/modules/data/methods/__init__.py +1 -0
  110. cognee/modules/data/methods/get_authorized_dataset.py +23 -0
  111. cognee/modules/data/models/Data.py +13 -3
  112. cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
  113. cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
  114. cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
  115. cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
  116. cognee/modules/engine/utils/generate_edge_id.py +5 -0
  117. cognee/modules/graph/cognee_graph/CogneeGraph.py +45 -35
  118. cognee/modules/graph/methods/get_formatted_graph_data.py +8 -2
  119. cognee/modules/graph/utils/get_graph_from_model.py +93 -101
  120. cognee/modules/ingestion/data_types/TextData.py +8 -2
  121. cognee/modules/ingestion/save_data_to_file.py +1 -1
  122. cognee/modules/pipelines/exceptions/__init__.py +1 -0
  123. cognee/modules/pipelines/exceptions/exceptions.py +12 -0
  124. cognee/modules/pipelines/models/DataItemStatus.py +5 -0
  125. cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
  126. cognee/modules/pipelines/models/__init__.py +1 -0
  127. cognee/modules/pipelines/operations/pipeline.py +10 -2
  128. cognee/modules/pipelines/operations/run_tasks.py +252 -20
  129. cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
  130. cognee/modules/retrieval/chunks_retriever.py +23 -1
  131. cognee/modules/retrieval/code_retriever.py +66 -9
  132. cognee/modules/retrieval/completion_retriever.py +11 -9
  133. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
  134. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
  135. cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
  136. cognee/modules/retrieval/graph_completion_retriever.py +1 -1
  137. cognee/modules/retrieval/insights_retriever.py +4 -0
  138. cognee/modules/retrieval/natural_language_retriever.py +9 -15
  139. cognee/modules/retrieval/summaries_retriever.py +23 -1
  140. cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
  141. cognee/modules/retrieval/utils/completion.py +6 -9
  142. cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
  143. cognee/modules/search/methods/search.py +5 -1
  144. cognee/modules/search/operations/__init__.py +1 -0
  145. cognee/modules/search/operations/select_search_type.py +42 -0
  146. cognee/modules/search/types/SearchType.py +1 -0
  147. cognee/modules/settings/get_settings.py +0 -8
  148. cognee/modules/settings/save_vector_db_config.py +1 -1
  149. cognee/shared/data_models.py +3 -1
  150. cognee/shared/logging_utils.py +0 -5
  151. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  152. cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
  153. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
  154. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
  155. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
  156. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
  157. cognee/tasks/graph/extract_graph_from_code.py +3 -2
  158. cognee/tasks/graph/extract_graph_from_data.py +4 -3
  159. cognee/tasks/graph/infer_data_ontology.py +5 -6
  160. cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
  161. cognee/tasks/ingestion/ingest_data.py +91 -61
  162. cognee/tasks/ingestion/resolve_data_directories.py +3 -0
  163. cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
  164. cognee/tasks/storage/index_data_points.py +1 -1
  165. cognee/tasks/storage/index_graph_edges.py +4 -1
  166. cognee/tasks/summarization/summarize_code.py +2 -3
  167. cognee/tasks/summarization/summarize_text.py +3 -2
  168. cognee/tests/test_cognee_server_start.py +12 -7
  169. cognee/tests/test_deduplication.py +2 -2
  170. cognee/tests/test_deletion.py +58 -17
  171. cognee/tests/test_graph_visualization_permissions.py +161 -0
  172. cognee/tests/test_neptune_analytics_graph.py +309 -0
  173. cognee/tests/test_neptune_analytics_hybrid.py +176 -0
  174. cognee/tests/{test_weaviate.py → test_neptune_analytics_vector.py} +86 -11
  175. cognee/tests/test_pgvector.py +5 -5
  176. cognee/tests/test_s3.py +1 -6
  177. cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
  178. cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
  179. cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
  180. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
  181. cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
  182. cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
  183. cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
  184. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
  185. cognee/tests/unit/modules/search/search_methods_test.py +55 -0
  186. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/METADATA +13 -9
  187. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/RECORD +203 -164
  188. cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
  189. cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
  190. cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
  191. cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
  192. cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
  193. cognee/modules/data/extraction/extract_categories.py +0 -14
  194. cognee/tests/test_qdrant.py +0 -99
  195. distributed/Dockerfile +0 -34
  196. distributed/app.py +0 -4
  197. distributed/entrypoint.py +0 -71
  198. distributed/entrypoint.sh +0 -5
  199. distributed/modal_image.py +0 -11
  200. distributed/queues.py +0 -5
  201. distributed/tasks/queued_add_data_points.py +0 -13
  202. distributed/tasks/queued_add_edges.py +0 -13
  203. distributed/tasks/queued_add_nodes.py +0 -13
  204. distributed/test.py +0 -28
  205. distributed/utils.py +0 -19
  206. distributed/workers/data_point_saving_worker.py +0 -93
  207. distributed/workers/graph_saving_worker.py +0 -104
  208. /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
  209. /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
  210. /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
  211. /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
  212. /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
  213. /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
  214. /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
  215. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
  216. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
  217. /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
  218. {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
  219. {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
  220. /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
  221. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/WHEEL +0 -0
  222. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
  223. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py DELETED
@@ -1,527 +0,0 @@
- from typing import List, Optional
-
- from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
-
- from cognee.shared.logging_utils import get_logger
- from cognee.exceptions import InvalidValueError
- from cognee.infrastructure.engine import DataPoint
- from cognee.infrastructure.engine.utils import parse_id
- from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError
-
- from distributed.utils import override_distributed
- from distributed.tasks.queued_add_data_points import queued_add_data_points
-
- from ..embeddings.EmbeddingEngine import EmbeddingEngine
- from ..models.ScoredResult import ScoredResult
- from ..vector_db_interface import VectorDBInterface
-
- logger = get_logger("WeaviateAdapter")
-
-
- def is_retryable_request(error):
-     from weaviate.exceptions import UnexpectedStatusCodeException
-     from requests.exceptions import RequestException
-
-     if isinstance(error, UnexpectedStatusCodeException):
-         # Retry on conflict, service unavailable, internal error
-         return error.status_code in {409, 503, 500}
-     if isinstance(error, RequestException):
-         return True  # Includes timeout, connection error, etc.
-     return False
-
-
- class IndexSchema(DataPoint):
-     """
-     Define a schema for indexing data points with textual content.
-
-     The IndexSchema class inherits from DataPoint and includes the following public
-     attributes:
-
-     - text: A string representing the main content of the data point.
-     - metadata: A dictionary containing indexing information, specifically the fields to be
-       indexed (in this case, the 'text' field).
-     """
-
-     text: str
-
-     metadata: dict = {"index_fields": ["text"]}
-
-
- class WeaviateAdapter(VectorDBInterface):
-     """
-     Adapt the Weaviate vector database to an interface for managing collections and data
-     points.
-
-     Public methods:
-     - get_client
-     - embed_data
-     - has_collection
-     - create_collection
-     - get_collection
-     - create_data_points
-     - create_vector_index
-     - index_data_points
-     - retrieve
-     - search
-     - batch_search
-     - delete_data_points
-     - prune
-     """
-
-     name = "Weaviate"
-     url: str
-     api_key: str
-     embedding_engine: EmbeddingEngine = None
-
-     def __init__(self, url: str, api_key: str, embedding_engine: EmbeddingEngine):
-         import weaviate
-         import weaviate.classes as wvc
-
-         self.url = url
-         self.api_key = api_key
-
-         self.embedding_engine = embedding_engine
-
-         self.client = weaviate.use_async_with_weaviate_cloud(
-             cluster_url=url,
-             auth_credentials=weaviate.auth.AuthApiKey(api_key),
-             additional_config=wvc.init.AdditionalConfig(timeout=wvc.init.Timeout(init=30)),
-         )
-
-     async def get_client(self):
-         """
-         Establish a connection to the Weaviate client.
-
-         Return the Weaviate client instance after connecting asynchronously.
-
-         Returns:
-         --------
-
-         The Weaviate client instance.
-         """
-         await self.client.connect()
-
-         return self.client
-
-     async def embed_data(self, data: List[str]) -> List[float]:
-         """
-         Embed the given text data into vector representations.
-
-         Given a list of strings, return their vector embeddings using the configured embedding
-         engine.
-
-         Parameters:
-         -----------
-
-         - data (List[str]): A list of strings to be embedded.
-
-         Returns:
-         --------
-
-         - List[float]: A list of float vectors corresponding to the embedded text data.
-         """
-         return await self.embedding_engine.embed_text(data)
-
-     async def has_collection(self, collection_name: str) -> bool:
-         """
-         Check if a collection exists in the Weaviate database.
-
-         Return a boolean indicating the presence of the specified collection.
-
-         Parameters:
-         -----------
-
-         - collection_name (str): The name of the collection to check.
-
-         Returns:
-         --------
-
-         - bool: True if the collection exists, otherwise False.
-         """
-         client = await self.get_client()
-         return await client.collections.exists(collection_name)
-
-     @retry(
-         retry=retry_if_exception(is_retryable_request),
-         stop=stop_after_attempt(3),
-         wait=wait_exponential(multiplier=2, min=1, max=6),
-     )
-     async def create_collection(
-         self,
-         collection_name: str,
-         payload_schema=None,
-     ):
-         """
-         Create a new collection in the Weaviate database if it does not already exist.
-
-         The collection will be initialized with a default schema.
-
-         Parameters:
-         -----------
-
-         - collection_name (str): The name of the new collection to be created.
-         - payload_schema: Optional schema definition for the collection payload. (default
-           None)
-
-         Returns:
-         --------
-
-         The created collection's configuration, if a new collection was made, otherwise
-         information about the existing collection.
-         """
-         import weaviate.classes.config as wvcc
-
-         if not await self.has_collection(collection_name):
-             client = await self.get_client()
-             return await client.collections.create(
-                 name=collection_name,
-                 properties=[
-                     wvcc.Property(
-                         name="text", data_type=wvcc.DataType.TEXT, skip_vectorization=True
-                     )
-                 ],
-             )
-         else:
-             return await self.get_collection(collection_name)
-
-     async def get_collection(self, collection_name: str):
-         """
-         Retrieve a collection from the Weaviate database by its name.
-
-         Raise a CollectionNotFoundError if the specified collection does not exist.
-
-         Parameters:
-         -----------
-
-         - collection_name (str): The name of the collection to be retrieved.
-
-         Returns:
-         --------
-
-         The requested collection object from the database.
-         """
-         if not await self.has_collection(collection_name):
-             raise CollectionNotFoundError(f"Collection '{collection_name}' not found.")
-
-         client = await self.get_client()
-         return client.collections.get(collection_name)
-
-     @retry(
-         retry=retry_if_exception(is_retryable_request),
-         stop=stop_after_attempt(3),
-         wait=wait_exponential(multiplier=2, min=1, max=6),
-     )
-     @override_distributed(queued_add_data_points)
-     async def create_data_points(self, collection_name: str, data_points: List[DataPoint]):
-         """
-         Create or update data points in the specified collection in the Weaviate database.
-
-         Process the list of data points, embedding them and either inserting them or updating if
-         they already exist.
-
-         Parameters:
-         -----------
-
-         - collection_name (str): The name of the collection to add data points to.
-         - data_points (List[DataPoint]): A list of DataPoint objects to be created or
-           updated in the collection.
-
-         Returns:
-         --------
-
-         Information about the inserted or updated data points in the collection.
-         """
-         from weaviate.classes.data import DataObject
-
-         data_vectors = await self.embed_data(
-             [DataPoint.get_embeddable_data(data_point) for data_point in data_points]
-         )
-
-         def convert_to_weaviate_data_points(data_point: DataPoint):
-             """
-             Transform a DataPoint object into a Weaviate DataObject format for insertion.
-
-             Return a DataObject ready for use in Weaviate with the properties and vector included.
-
-             Parameters:
-             -----------
-
-             - data_point (DataPoint): The DataPoint to convert into the Weaviate DataObject
-               format.
-
-             Returns:
-             --------
-
-             The corresponding Weaviate DataObject representing the data point.
-             """
-             vector = data_vectors[data_points.index(data_point)]
-             properties = data_point.model_dump()
-
-             if "id" in properties:
-                 properties["uuid"] = str(data_point.id)
-                 del properties["id"]
-
-             return DataObject(uuid=data_point.id, properties=properties, vector=vector)
-
-         data_points = [convert_to_weaviate_data_points(data_point) for data_point in data_points]
-
-         collection = await self.get_collection(collection_name)
-
-         try:
-             if len(data_points) > 1:
-                 return await collection.data.insert_many(data_points)
-                 # with collection.batch.dynamic() as batch:
-                 #     for data_point in data_points:
-                 #         batch.add_object(
-                 #             uuid=data_point.uuid,
-                 #             vector=data_point.vector,
-                 #             properties=data_point.properties,
-                 #             references=data_point.references,
-                 #         )
-             else:
-                 data_point: DataObject = data_points[0]
-                 if await collection.data.exists(data_point.uuid):
-                     return await collection.data.update(
-                         uuid=data_point.uuid,
-                         vector=data_point.vector,
-                         properties=data_point.properties,
-                         references=data_point.references,
-                     )
-                 else:
-                     return await collection.data.insert(
-                         uuid=data_point.uuid,
-                         vector=data_point.vector,
-                         properties=data_point.properties,
-                         references=data_point.references,
-                     )
-         except Exception as error:
-             logger.error("Error creating data points: %s", str(error))
-             raise error
-
-     async def create_vector_index(self, index_name: str, index_property_name: str):
-         """
-         Create a vector index based on an index name and property name by creating a
-         corresponding collection.
-
-         Parameters:
-         -----------
-
-         - index_name (str): The name for the vector index.
-         - index_property_name (str): The property name associated with the vector index.
-
-         Returns:
-         --------
-
-         The created collection representing the vector index.
-         """
-         return await self.create_collection(f"{index_name}_{index_property_name}")
-
-     async def index_data_points(
-         self, index_name: str, index_property_name: str, data_points: list[DataPoint]
-     ):
-         """
-         Index a list of data points by creating an associated vector index collection.
-
-         Data points are transformed into embeddable data before being processed for indexing.
-
-         Parameters:
-         -----------
-
-         - index_name (str): The index name under which to store the data points.
-         - index_property_name (str): The associated property name for the index.
-         - data_points (list[DataPoint]): A list of DataPoint objects to be indexed.
-
-         Returns:
-         --------
-
-         Information about the operation of indexing the data points.
-         """
-         return await self.create_data_points(
-             f"{index_name}_{index_property_name}",
-             [
-                 IndexSchema(
-                     id=data_point.id,
-                     text=DataPoint.get_embeddable_data(data_point),
-                 )
-                 for data_point in data_points
-             ],
-         )
-
-     async def retrieve(self, collection_name: str, data_point_ids: list[str]):
-         """
-         Fetch data points from a specified collection based on their IDs.
-
-         Return data points wrapped in an object containing their properties after
-         transformation.
-
-         Parameters:
-         -----------
-
-         - collection_name (str): The name of the collection to retrieve data points from.
-         - data_point_ids (list[str]): A list of IDs for the data points to retrieve.
-
-         Returns:
-         --------
-
-         A list of objects representing the retrieved data points.
-         """
-         from weaviate.classes.query import Filter
-
-         collection = await self.get_collection(collection_name)
-         data_points = await collection.query.fetch_objects(
-             filters=Filter.by_id().contains_any(data_point_ids)
-         )
-
-         for data_point in data_points.objects:
-             data_point.payload = data_point.properties
-             data_point.id = data_point.uuid
-             del data_point.properties
-
-         return data_points.objects
-
-     async def search(
-         self,
-         collection_name: str,
-         query_text: Optional[str] = None,
-         query_vector: Optional[List[float]] = None,
-         limit: int = 15,
-         with_vector: bool = False,
-     ):
-         """
-         Perform a search on a collection using either a text query or a vector query.
-
-         Return scored results based on the search criteria provided. Raise InvalidValueError if
-         no query is provided.
-
-         Parameters:
-         -----------
-
-         - collection_name (str): The name of the collection to search within.
-         - query_text (Optional[str]): Optional plain text query for searching. (default
-           None)
-         - query_vector (Optional[List[float]]): Optional vector representation for
-           searching. (default None)
-         - limit (int): The maximum number of results to return. (default 15)
-         - with_vector (bool): Include vector information in the results. (default False)
-
-         Returns:
-         --------
-
-         A list of scored results matching the search criteria.
-         """
-         import weaviate.classes as wvc
-         import weaviate.exceptions
-
-         if query_text is None and query_vector is None:
-             raise InvalidValueError(message="One of query_text or query_vector must be provided!")
-
-         if query_vector is None:
-             query_vector = (await self.embed_data([query_text]))[0]
-
-         collection = await self.get_collection(collection_name)
-
-         try:
-             search_result = await collection.query.hybrid(
-                 query=None,
-                 vector=query_vector,
-                 limit=limit if limit > 0 else None,
-                 include_vector=with_vector,
-                 return_metadata=wvc.query.MetadataQuery(score=True),
-             )
-
-             return [
-                 ScoredResult(
-                     id=parse_id(str(result.uuid)),
-                     payload=result.properties,
-                     score=1 - float(result.metadata.score),
-                 )
-                 for result in search_result.objects
-             ]
-         except weaviate.exceptions.WeaviateInvalidInputError:
-             # Ignore if the collection doesn't exist
-             return []
-
-     async def batch_search(
-         self, collection_name: str, query_texts: List[str], limit: int, with_vectors: bool = False
-     ):
-         """
-         Execute a batch search for multiple query texts in the specified collection.
-
-         Return a list of results for each query performed in parallel.
-
-         Parameters:
-         -----------
-
-         - collection_name (str): The name of the collection to search within.
-         - query_texts (List[str]): A list of text queries to be processed in a batch.
-         - limit (int): The maximum number of results to return for each query.
-         - with_vectors (bool): Indicate whether to include vector information in the
-           results. (default False)
-
-         Returns:
-         --------
-
-         A list containing results for each search query executed.
-         """
-
-         def query_search(query_vector):
-             """
-             Wrap the search operation based on a query vector for fetching results.
-
-             This function coordinates the search call, ensuring the collection name and search
-             parameters are applied.
-
-             Parameters:
-             -----------
-
-             - query_vector: The vector representation of the query for searching.
-
-             Returns:
-             --------
-
-             The results of the search operation on the specified collection.
-             """
-             return self.search(
-                 collection_name, query_vector=query_vector, limit=limit, with_vector=with_vectors
-             )
-
-         return [
-             await query_search(query_vector) for query_vector in await self.embed_data(query_texts)
-         ]
-
-     async def delete_data_points(self, collection_name: str, data_point_ids: list[str]):
-         """
-         Remove specified data points from a collection based on their IDs.
-
-         Return information about the deletion result, ideally confirming the operation's
-         success.
-
-         Parameters:
-         -----------
-
-         - collection_name (str): The name of the collection from which to delete data
-           points.
-         - data_point_ids (list[str]): A list of IDs for the data points to be deleted.
-
-         Returns:
-         --------
-
-         Confirmation of deletion operation result.
-         """
-         from weaviate.classes.query import Filter
-
-         collection = await self.get_collection(collection_name)
-         result = await collection.data.delete_many(
-             filters=Filter.by_id().contains_any(data_point_ids)
-         )
-
-         return result
-
-     async def prune(self):
-         """
-         Delete all collections from the Weaviate database.
-
-         This operation will remove all data and cannot be undone.
-         """
-         client = await self.get_client()
-         await client.collections.delete_all()
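
Note: the retry policy in the adapter above is tenacity's standard predicate-based pattern. A minimal, self-contained sketch of the same idea, detached from Weaviate (TransientError, is_transient, and fetch_once are hypothetical names invented for illustration; reraise=True is added so the original error, not tenacity's RetryError, propagates once attempts are exhausted):

import asyncio

from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential


class TransientError(Exception):
    """Hypothetical stand-in for retryable failures (409/500/503, timeouts)."""


def is_transient(error: BaseException) -> bool:
    # Predicate gate, mirroring is_retryable_request above.
    return isinstance(error, TransientError)


@retry(
    retry=retry_if_exception(is_transient),  # retry only errors the predicate accepts
    stop=stop_after_attempt(3),  # at most three attempts in total
    wait=wait_exponential(multiplier=2, min=1, max=6),  # exponential backoff, capped at 6s
    reraise=True,  # surface the original error once attempts are exhausted
)
async def fetch_once() -> str:
    # Hypothetical flaky operation; a real adapter would make a network call here.
    raise TransientError("service unavailable")


if __name__ == "__main__":
    try:
        asyncio.run(fetch_once())
    except TransientError:
        print("gave up after 3 attempts")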
cognee/infrastructure/databases/vector/weaviate_db/__init__.py DELETED
@@ -1 +0,0 @@
- from .WeaviateAdapter import WeaviateAdapter
cognee/modules/data/extraction/extract_categories.py DELETED
@@ -1,14 +0,0 @@
- from typing import Type, List
- from pydantic import BaseModel
- from cognee.infrastructure.llm.prompts import read_query_prompt
- from cognee.infrastructure.llm.get_llm_client import get_llm_client
-
-
- async def extract_categories(content: str, response_model: Type[BaseModel]):
-     llm_client = get_llm_client()
-
-     system_prompt = read_query_prompt("classify_content.txt")
-
-     llm_output = await llm_client.acreate_structured_output(content, system_prompt, response_model)
-
-     return llm_output
cognee/tests/test_qdrant.py DELETED
@@ -1,99 +0,0 @@
- import os
- import pathlib
- import cognee
- from cognee.infrastructure.files.storage import get_storage_config
- from cognee.modules.search.operations import get_history
- from cognee.modules.users.methods import get_default_user
- from cognee.shared.logging_utils import get_logger
- from cognee.modules.search.types import SearchType
-
- logger = get_logger()
-
-
- async def main():
-     cognee.config.set_vector_db_provider("qdrant")
-     data_directory_path = str(
-         pathlib.Path(
-             os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_qdrant")
-         ).resolve()
-     )
-     cognee.config.data_root_directory(data_directory_path)
-     cognee_directory_path = str(
-         pathlib.Path(
-             os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_qdrant")
-         ).resolve()
-     )
-     cognee.config.system_root_directory(cognee_directory_path)
-
-     await cognee.prune.prune_data()
-     await cognee.prune.prune_system(metadata=True)
-
-     dataset_name = "cs_explanations"
-
-     explanation_file_path = os.path.join(
-         pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt"
-     )
-     await cognee.add([explanation_file_path], dataset_name)
-
-     text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
-     At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states.
-     Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible.
-     The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly.
-     Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate.
-     In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited.
-     """
-
-     await cognee.add([text], dataset_name)
-
-     await cognee.cognify([dataset_name])
-
-     from cognee.infrastructure.databases.vector import get_vector_engine
-
-     vector_engine = get_vector_engine()
-     search_results = await vector_engine.search("Entity_name", "Quantum computer")
-
-     assert len(search_results) != 0, "The search results list is empty."
-
-     random_node = search_results[0]
-     random_node_name = random_node.payload["text"]
-
-     search_results = await cognee.search(
-         query_type=SearchType.INSIGHTS, query_text=random_node_name
-     )
-     assert len(search_results) != 0, "The search results list is empty."
-     print("\n\nExtracted sentences are:\n")
-     for result in search_results:
-         print(f"{result}\n")
-
-     search_results = await cognee.search(query_type=SearchType.CHUNKS, query_text=random_node_name)
-     assert len(search_results) != 0, "The search results list is empty."
-     print("\n\nExtracted chunks are:\n")
-     for result in search_results:
-         print(f"{result}\n")
-
-     search_results = await cognee.search(
-         query_type=SearchType.SUMMARIES, query_text=random_node_name
-     )
-     assert len(search_results) != 0, "Query related summaries don't exist."
-     print("\nExtracted summaries are:\n")
-     for result in search_results:
-         print(f"{result}\n")
-
-     user = await get_default_user()
-     history = await get_history(user.id)
-     assert len(history) == 6, "Search history is not correct."
-
-     await cognee.prune.prune_data()
-     data_root_directory = get_storage_config()["data_root_directory"]
-     assert not os.path.isdir(data_root_directory), "Local data files are not deleted"
-
-     await cognee.prune.prune_system(metadata=True)
-     qdrant_client = get_vector_engine().get_qdrant_client()
-     collections_response = await qdrant_client.get_collections()
-     assert len(collections_response.collections) == 0, "QDrant vector database is not empty"
-
-
- if __name__ == "__main__":
-     import asyncio
-
-     asyncio.run(main())
distributed/Dockerfile DELETED
@@ -1,34 +0,0 @@
- FROM python:3.11-slim
-
- # Set environment variables
- ENV PIP_NO_CACHE_DIR=true
- ENV PATH="${PATH}:/root/.poetry/bin"
- ENV PYTHONPATH=/app
- ENV RUN_MODE=modal
- ENV SKIP_MIGRATIONS=true
- ENV COGNEE_DISTRIBUTED=true
-
- # System dependencies
- RUN apt-get update && apt-get install -y \
-     gcc \
-     libpq-dev \
-     git \
-     curl \
-     build-essential \
-     && rm -rf /var/lib/apt/lists/*
-
- WORKDIR /app
-
- COPY pyproject.toml poetry.lock README.md /app/
-
- RUN pip install poetry
-
- RUN poetry config virtualenvs.create false
-
- RUN poetry install --extras neo4j --extras postgres --extras aws --extras distributed --no-root
-
- COPY cognee/ /app/cognee
- COPY distributed/ /app/distributed
- RUN chmod +x /app/distributed/entrypoint.sh
-
- ENTRYPOINT ["/app/distributed/entrypoint.sh"]
distributed/app.py DELETED
@@ -1,4 +0,0 @@
- from modal import App
-
-
- app = App("cognee_modal_distributed")
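
For context, a modal.App object like the one deleted above is used by registering functions on it and invoking them remotely. A minimal sketch assuming Modal's current public API (the app name, hello function, and entrypoint below are hypothetical, for illustration only):

from modal import App

app = App("example-app")  # hypothetical name; the deleted file used "cognee_modal_distributed"


@app.function()
def hello(name: str) -> str:
    # Executes inside a Modal container when called with .remote()
    return f"hello, {name}"


@app.local_entrypoint()
def main():
    # `modal run app.py` runs this entrypoint locally and dispatches hello() remotely
    print(hello.remote("cognee"))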