cognee 0.5.0__py3-none-any.whl → 0.5.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. cognee/api/client.py +5 -1
  2. cognee/api/v1/add/add.py +1 -2
  3. cognee/api/v1/cognify/code_graph_pipeline.py +119 -0
  4. cognee/api/v1/cognify/cognify.py +16 -24
  5. cognee/api/v1/cognify/routers/__init__.py +1 -0
  6. cognee/api/v1/cognify/routers/get_code_pipeline_router.py +90 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +1 -3
  8. cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
  9. cognee/api/v1/ontologies/ontologies.py +37 -12
  10. cognee/api/v1/ontologies/routers/get_ontology_router.py +25 -27
  11. cognee/api/v1/search/search.py +0 -4
  12. cognee/api/v1/ui/ui.py +68 -38
  13. cognee/context_global_variables.py +16 -61
  14. cognee/eval_framework/answer_generation/answer_generation_executor.py +0 -10
  15. cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
  16. cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +2 -0
  17. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
  18. cognee/eval_framework/eval_config.py +2 -2
  19. cognee/eval_framework/modal_run_eval.py +28 -16
  20. cognee/infrastructure/databases/graph/config.py +0 -3
  21. cognee/infrastructure/databases/graph/get_graph_engine.py +0 -1
  22. cognee/infrastructure/databases/graph/graph_db_interface.py +0 -15
  23. cognee/infrastructure/databases/graph/kuzu/adapter.py +0 -228
  24. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +1 -80
  25. cognee/infrastructure/databases/utils/__init__.py +0 -3
  26. cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +48 -62
  27. cognee/infrastructure/databases/vector/config.py +0 -2
  28. cognee/infrastructure/databases/vector/create_vector_engine.py +0 -1
  29. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +6 -8
  30. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +7 -9
  31. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +10 -11
  32. cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +544 -0
  33. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -2
  34. cognee/infrastructure/databases/vector/vector_db_interface.py +0 -35
  35. cognee/infrastructure/files/storage/s3_config.py +0 -2
  36. cognee/infrastructure/llm/LLMGateway.py +2 -5
  37. cognee/infrastructure/llm/config.py +0 -35
  38. cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
  39. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +8 -23
  40. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +16 -17
  41. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +37 -40
  42. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +36 -39
  43. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +1 -19
  44. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +9 -11
  45. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +21 -23
  46. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +34 -42
  47. cognee/modules/cognify/config.py +0 -2
  48. cognee/modules/data/deletion/prune_system.py +2 -52
  49. cognee/modules/data/methods/delete_dataset.py +0 -26
  50. cognee/modules/engine/models/__init__.py +0 -1
  51. cognee/modules/graph/cognee_graph/CogneeGraph.py +37 -85
  52. cognee/modules/graph/cognee_graph/CogneeGraphElements.py +3 -8
  53. cognee/modules/memify/memify.py +7 -1
  54. cognee/modules/pipelines/operations/pipeline.py +2 -18
  55. cognee/modules/retrieval/__init__.py +1 -1
  56. cognee/modules/retrieval/code_retriever.py +232 -0
  57. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -4
  58. cognee/modules/retrieval/graph_completion_cot_retriever.py +0 -4
  59. cognee/modules/retrieval/graph_completion_retriever.py +0 -10
  60. cognee/modules/retrieval/graph_summary_completion_retriever.py +0 -4
  61. cognee/modules/retrieval/temporal_retriever.py +0 -4
  62. cognee/modules/retrieval/utils/brute_force_triplet_search.py +10 -42
  63. cognee/modules/run_custom_pipeline/run_custom_pipeline.py +1 -8
  64. cognee/modules/search/methods/get_search_type_tools.py +8 -54
  65. cognee/modules/search/methods/no_access_control_search.py +0 -4
  66. cognee/modules/search/methods/search.py +0 -21
  67. cognee/modules/search/types/SearchType.py +1 -1
  68. cognee/modules/settings/get_settings.py +0 -19
  69. cognee/modules/users/methods/get_authenticated_user.py +2 -2
  70. cognee/modules/users/models/DatasetDatabase.py +3 -15
  71. cognee/shared/logging_utils.py +0 -4
  72. cognee/tasks/code/enrich_dependency_graph_checker.py +35 -0
  73. cognee/tasks/code/get_local_dependencies_checker.py +20 -0
  74. cognee/tasks/code/get_repo_dependency_graph_checker.py +35 -0
  75. cognee/tasks/documents/__init__.py +1 -0
  76. cognee/tasks/documents/check_permissions_on_dataset.py +26 -0
  77. cognee/tasks/graph/extract_graph_from_data.py +10 -9
  78. cognee/tasks/repo_processor/__init__.py +2 -0
  79. cognee/tasks/repo_processor/get_local_dependencies.py +335 -0
  80. cognee/tasks/repo_processor/get_non_code_files.py +158 -0
  81. cognee/tasks/repo_processor/get_repo_file_dependencies.py +243 -0
  82. cognee/tasks/storage/add_data_points.py +2 -142
  83. cognee/tests/test_cognee_server_start.py +4 -2
  84. cognee/tests/test_conversation_history.py +1 -23
  85. cognee/tests/test_delete_bmw_example.py +60 -0
  86. cognee/tests/test_search_db.py +1 -37
  87. cognee/tests/unit/api/test_ontology_endpoint.py +89 -77
  88. cognee/tests/unit/infrastructure/mock_embedding_engine.py +7 -3
  89. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -0
  90. cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
  91. cognee/tests/unit/modules/graph/cognee_graph_test.py +0 -406
  92. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/METADATA +89 -76
  93. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/RECORD +97 -118
  94. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/WHEEL +1 -1
  95. cognee/api/v1/ui/node_setup.py +0 -360
  96. cognee/api/v1/ui/npm_utils.py +0 -50
  97. cognee/eval_framework/Dockerfile +0 -29
  98. cognee/infrastructure/databases/dataset_database_handler/__init__.py +0 -3
  99. cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +0 -80
  100. cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +0 -18
  101. cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +0 -10
  102. cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +0 -81
  103. cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +0 -168
  104. cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +0 -10
  105. cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +0 -10
  106. cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +0 -30
  107. cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +0 -50
  108. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +0 -5
  109. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +0 -153
  110. cognee/memify_pipelines/create_triplet_embeddings.py +0 -53
  111. cognee/modules/engine/models/Triplet.py +0 -9
  112. cognee/modules/retrieval/register_retriever.py +0 -10
  113. cognee/modules/retrieval/registered_community_retrievers.py +0 -1
  114. cognee/modules/retrieval/triplet_retriever.py +0 -182
  115. cognee/shared/rate_limiting.py +0 -30
  116. cognee/tasks/memify/get_triplet_datapoints.py +0 -289
  117. cognee/tests/integration/retrieval/test_triplet_retriever.py +0 -84
  118. cognee/tests/integration/tasks/test_add_data_points.py +0 -139
  119. cognee/tests/integration/tasks/test_get_triplet_datapoints.py +0 -69
  120. cognee/tests/test_dataset_database_handler.py +0 -137
  121. cognee/tests/test_dataset_delete.py +0 -76
  122. cognee/tests/test_edge_centered_payload.py +0 -170
  123. cognee/tests/test_pipeline_cache.py +0 -164
  124. cognee/tests/unit/infrastructure/llm/test_llm_config.py +0 -46
  125. cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +0 -214
  126. cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +0 -608
  127. cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +0 -83
  128. cognee/tests/unit/tasks/storage/test_add_data_points.py +0 -288
  129. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/entry_points.txt +0 -0
  130. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/LICENSE +0 -0
  131. {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/NOTICE.md +0 -0
@@ -0,0 +1,20 @@
1
import argparse
import asyncio

from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Get local script dependencies.")

    # Suggested path: .../cognee/examples/python/simple_example.py
    parser.add_argument("script_path", type=str, help="Absolute path to the Python script file")

    # Suggested path: .../cognee
    parser.add_argument("repo_path", type=str, help="Absolute path to the repository root")

    args = parser.parse_args()

    # BUGFIX: get_local_script_dependencies expects (repo_path, script_path);
    # the previous call passed them in the opposite order. It also returns a
    # CodeFile node (not an iterable), whose `depends_on` list is populated
    # only when detailed_extraction is enabled.
    code_file = asyncio.run(
        get_local_script_dependencies(args.repo_path, args.script_path, detailed_extraction=True)
    )

    print("Dependencies:")
    for dependency in code_file.depends_on:
        print(dependency)
@@ -0,0 +1,35 @@
1
import os
import asyncio
import argparse
from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file_dependencies


def main():
    """
    Parse command line arguments and print the repository file dependencies.

    Reads a repository path from the command line and verifies that it exists,
    printing an error message and returning early when it does not. For a valid
    path, builds the dependency graph asynchronously and prints every node
    together with its outgoing edges and their relations.
    """
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument("repo_path", help="Path to the repository")
    arguments = argument_parser.parse_args()

    repository_path = arguments.repo_path
    if not os.path.exists(repository_path):
        print(f"Error: The provided repository path does not exist: {repository_path}")
        return

    dependency_graph = asyncio.run(get_repo_file_dependencies(repository_path))

    for graph_node in dependency_graph.nodes:
        print(f"Node: {graph_node}")
        for _, edge_target, edge_data in dependency_graph.edges(graph_node, data=True):
            print(f"  Edge to {edge_target}, Relation: {edge_data.get('relation')}")


if __name__ == "__main__":
    main()
@@ -1,2 +1,3 @@
1
1
  from .classify_documents import classify_documents
2
2
  from .extract_chunks_from_documents import extract_chunks_from_documents
3
+ from .check_permissions_on_dataset import check_permissions_on_dataset
@@ -0,0 +1,26 @@
1
+ from cognee.modules.data.processing.document_types import Document
2
+ from cognee.modules.users.permissions.methods import check_permission_on_dataset
3
+ from typing import List
4
+
5
+
6
async def check_permissions_on_dataset(
    documents: List[Document], context: dict, user, permissions
) -> List[Document]:
    """
    Validate a user's permissions on the dataset the given documents belong to.

    Sequentially checks every permission in `permissions` against the dataset
    taken from ``context["dataset"]``, then returns the documents unchanged so
    the task can be chained in a pipeline.

    Notes:
        - Assumes `check_permission_on_dataset` raises an exception when a
          permission check fails; this function performs no handling itself.
        - The dataset is currently read from the pipeline `context`; see the
          TODO below about passing it as an explicit argument.
        - Ensure that the `Document` and `user` objects conform to the expected
          structure and interfaces.
    """

    for permission in permissions:
        await check_permission_on_dataset(
            user,
            permission,
            # TODO: pass dataset through argument instead of context
            context["dataset"].id,
        )

    return documents
@@ -2,7 +2,9 @@ import asyncio
2
2
  from typing import Type, List, Optional
3
3
  from pydantic import BaseModel
4
4
 
5
+ from cognee.infrastructure.databases.graph import get_graph_engine
5
6
  from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
7
+ from cognee.tasks.storage import index_graph_edges
6
8
  from cognee.tasks.storage.add_data_points import add_data_points
7
9
  from cognee.modules.ontology.ontology_config import Config
8
10
  from cognee.modules.ontology.get_default_ontology_resolver import (
@@ -23,7 +25,6 @@ from cognee.tasks.graph.exceptions import (
23
25
  InvalidChunkGraphInputError,
24
26
  InvalidOntologyAdapterError,
25
27
  )
26
- from cognee.modules.cognify.config import get_cognify_config
27
28
 
28
29
 
29
30
  async def integrate_chunk_graphs(
@@ -66,6 +67,8 @@ async def integrate_chunk_graphs(
66
67
  type(ontology_resolver).__name__ if ontology_resolver else "None"
67
68
  )
68
69
 
70
+ graph_engine = await get_graph_engine()
71
+
69
72
  if graph_model is not KnowledgeGraph:
70
73
  for chunk_index, chunk_graph in enumerate(chunk_graphs):
71
74
  data_chunks[chunk_index].contains = chunk_graph
@@ -81,13 +84,12 @@ async def integrate_chunk_graphs(
81
84
  data_chunks, chunk_graphs, ontology_resolver, existing_edges_map
82
85
  )
83
86
 
84
- cognify_config = get_cognify_config()
85
- embed_triplets = cognify_config.triplet_embedding
86
-
87
87
  if len(graph_nodes) > 0:
88
- await add_data_points(
89
- data_points=graph_nodes, custom_edges=graph_edges, embed_triplets=embed_triplets
90
- )
88
+ await add_data_points(graph_nodes)
89
+
90
+ if len(graph_edges) > 0:
91
+ await graph_engine.add_edges(graph_edges)
92
+ await index_graph_edges(graph_edges)
91
93
 
92
94
  return data_chunks
93
95
 
@@ -97,7 +99,6 @@ async def extract_graph_from_data(
97
99
  graph_model: Type[BaseModel],
98
100
  config: Config = None,
99
101
  custom_prompt: Optional[str] = None,
100
- **kwargs,
101
102
  ) -> List[DocumentChunk]:
102
103
  """
103
104
  Extracts and integrates a knowledge graph from the text content of document chunks using a specified graph model.
@@ -112,7 +113,7 @@ async def extract_graph_from_data(
112
113
 
113
114
  chunk_graphs = await asyncio.gather(
114
115
  *[
115
- extract_content_graph(chunk.text, graph_model, custom_prompt=custom_prompt, **kwargs)
116
+ extract_content_graph(chunk.text, graph_model, custom_prompt=custom_prompt)
116
117
  for chunk in data_chunks
117
118
  ]
118
119
  )
@@ -0,0 +1,2 @@
1
+ from .get_non_code_files import get_non_py_files
2
+ from .get_repo_file_dependencies import get_repo_file_dependencies
@@ -0,0 +1,335 @@
1
+ import os
2
+ import aiofiles
3
+ import importlib
4
+ from typing import AsyncGenerator, Optional
5
+ from uuid import NAMESPACE_OID, uuid5
6
+ import tree_sitter_python as tspython
7
+ from tree_sitter import Language, Node, Parser, Tree
8
+ from cognee.shared.logging_utils import get_logger
9
+
10
+ from cognee.low_level import DataPoint
11
+ from cognee.shared.CodeGraphEntities import (
12
+ CodeFile,
13
+ ImportStatement,
14
+ FunctionDefinition,
15
+ ClassDefinition,
16
+ )
17
+
18
+ logger = get_logger()
19
+
20
+
21
class FileParser:
    """
    Handles the parsing of files into source code and an abstract syntax tree
    representation. Public methods include:

    - parse_file: Parses a file and returns its source code and syntax tree representation.
    """

    def __init__(self):
        # Cache of file_path -> (source_code, syntax_tree) so each file is read
        # and parsed at most once per parser instance.
        self.parsed_files = {}
        # PERF: build the tree-sitter Language and Parser once here instead of
        # reconstructing them on every parse_file call (the old code rebuilt
        # them even on cache hits).
        self._source_code_parser = Parser(Language(tspython.language()))

    async def parse_file(self, file_path: str) -> tuple[str, Tree]:
        """
        Parse a file and return its source code along with its syntax tree representation.

        If the file has already been parsed, retrieve the result from memory instead of reading
        the file again.

        Parameters:
        -----------

        - file_path (str): The path of the file to parse.

        Returns:
        --------

        - tuple[str, Tree]: A tuple containing the source code of the file and its
          corresponding syntax tree representation.
        """
        if file_path not in self.parsed_files:
            source_code = await get_source_code(file_path)
            # NOTE(review): get_source_code returns None on read errors, which
            # would make bytes() raise here — same as the previous behavior.
            source_code_tree = self._source_code_parser.parse(bytes(source_code, "utf-8"))
            self.parsed_files[file_path] = (source_code, source_code_tree)

        return self.parsed_files[file_path]
59
+
60
+
61
async def get_source_code(file_path: str):
    """
    Read source code from a file asynchronously.

    Opens the file at *file_path*, reads its full contents, and returns them as
    a string. Any failure while opening or reading (missing file, permission
    error, decode error, ...) is logged and reported as None instead of raising.

    Parameters:
    -----------

    - file_path (str): The path to the file from which to read the source code.

    Returns:
    --------

    The contents of the file as a string if successful, or None if an error
    occurs.
    """
    try:
        async with aiofiles.open(file_path, "r", encoding="utf-8") as opened_file:
            return await opened_file.read()
    except Exception as error:
        logger.error(f"Error reading file {file_path}: {str(error)}")
        return None
87
+
88
+
89
def resolve_module_path(module_name):
    """
    Find the file path of a module.

    Return the file path of the specified module if found, or None if the
    module does not exist or cannot be located.

    Parameters:
    -----------

    - module_name: The name of the module whose file path is to be resolved.

    Returns:
    --------

    The file path of the module as a string, or None if the module is not found.
    """
    # BUGFIX: the file only does `import importlib`, which does not guarantee
    # the `importlib.util` submodule is loaded — accessing it could raise
    # AttributeError. Import it explicitly before use.
    import importlib.util

    try:
        spec = importlib.util.find_spec(module_name)
        if spec and spec.origin:
            return spec.origin
    except (ModuleNotFoundError, ValueError):
        # ValueError is raised by find_spec for modules whose __spec__ is not
        # set (e.g. __main__); treat it the same as "not found".
        return None
    return None
113
+
114
+
115
def find_function_location(
    module_path: str, function_name: str, parser: FileParser
) -> Optional[tuple[str, str]]:
    """
    Find the location of a function definition in a specified module.

    Parameters:
    -----------

    - module_path (str): The path to the module where the function is defined.
    - function_name (str): The name of the function whose location is to be found.
    - parser (FileParser): Parser whose cache of already-parsed files is reused
      (and populated) by this lookup.

    Returns:
    --------

    - Optional[tuple[str, str]]: Returns a tuple containing the module path and the
      start point of the function if found; otherwise, returns None.
    """
    if not module_path or not os.path.exists(module_path):
        return None

    # BUGFIX: FileParser.parse_file is a coroutine; the previous code called it
    # without awaiting, so unpacking the returned coroutine object raised a
    # TypeError at runtime. This function is synchronous, so reuse the parser's
    # cache when available and otherwise read + parse the file synchronously.
    if module_path in parser.parsed_files:
        source_code, tree = parser.parsed_files[module_path]
    else:
        try:
            with open(module_path, "r", encoding="utf-8") as module_file:
                source_code = module_file.read()
        except Exception as error:
            logger.error(f"Error reading file {module_path}: {str(error)}")
            return None
        tree = Parser(Language(tspython.language())).parse(bytes(source_code, "utf-8"))
        parser.parsed_files[module_path] = (source_code, tree)

    root_node: Node = tree.root_node

    for node in root_node.children:
        if node.type == "function_definition":
            func_name_node = node.child_by_field_name("name")

            if func_name_node and func_name_node.text.decode() == function_name:
                return (module_path, node.start_point)  # (line, column)

    return None
149
+
150
+
151
async def get_local_script_dependencies(
    repo_path: str, script_path: str, detailed_extraction: bool = False
) -> CodeFile:
    """
    Build a CodeFile node for *script_path* within the repository at *repo_path*.

    Parameters:
    -----------

    - repo_path (str): The path to the repository that contains the script.
    - script_path (str): The path of the script for which dependencies are being
      extracted.
    - detailed_extraction (bool): When True, the script's imports, function
      definitions, and class definitions are attached as separate linked nodes;
      when False, only the raw source code is kept on the node.

    Returns:
    --------

    - CodeFile: A CodeFile object containing information about the script,
      including its dependencies and definitions.
    """
    script_parser = FileParser()
    source_code, syntax_tree = await script_parser.parse_file(script_path)

    relative_file_path = script_path[len(repo_path) + 1 :]
    code_file_id = uuid5(NAMESPACE_OID, script_path)

    # Shallow mode: keep the whole source on the node and skip AST traversal.
    if not detailed_extraction:
        return CodeFile(
            id=code_file_id,
            name=relative_file_path,
            source_code=source_code,
            file_path=script_path,
            language="python",
        )

    code_file_node = CodeFile(
        id=code_file_id,
        name=relative_file_path,
        source_code=None,
        file_path=script_path,
        language="python",
    )

    async for code_part in extract_code_parts(syntax_tree.root_node, script_path=script_path):
        code_part.file_path = script_path

        if isinstance(code_part, FunctionDefinition):
            code_file_node.provides_function_definition.append(code_part)
        if isinstance(code_part, ClassDefinition):
            code_file_node.provides_class_definition.append(code_part)
        if isinstance(code_part, ImportStatement):
            code_file_node.depends_on.append(code_part)

    return code_file_node
206
+
207
+
208
def find_node(nodes: list[Node], condition: callable) -> Node:
    """
    Return the first node that satisfies the given condition.

    Scans *nodes* in order and returns the first element for which *condition*
    evaluates truthy; returns None when no element matches.

    Parameters:
    -----------

    - nodes (list[Node]): A list of Node objects to search through.
    - condition (callable): A predicate applied to each node in turn.

    Returns:
    --------

    - Node: The first Node that matches the condition, or None if no such node exists.
    """
    return next((candidate for candidate in nodes if condition(candidate)), None)
232
+
233
+
234
async def extract_code_parts(
    tree_root: Node, script_path: str, existing_nodes: Optional[dict] = None
) -> AsyncGenerator[DataPoint, None]:
    """
    Extract code parts from a given AST node tree asynchronously.

    Yields DataPoint nodes for the import statements, function definitions, and
    class definitions found among the direct children of *tree_root*. Nodes
    already present in *existing_nodes* are reused, so a given name is only
    materialized once per extraction.

    Parameters:
    -----------

    - tree_root (Node): The root node of the AST tree containing code parts to extract.
    - script_path (str): The file path of the script from which the AST was generated.
    - existing_nodes (Optional[dict]): Already-extracted DataPoint nodes keyed by
      name, used to avoid duplicates. A fresh dict is created when omitted.

    Returns:
    --------

    Yields DataPoint nodes representing imported modules, functions, and classes.
    """
    # BUGFIX: the previous signature used a mutable default `{}` (annotated as a
    # list, though used as a dict). Python shares mutable defaults across all
    # calls, so nodes extracted from one script silently suppressed nodes with
    # the same name in every later script. A None sentinel gives each call its
    # own dict while keeping the parameter optional.
    if existing_nodes is None:
        existing_nodes = {}

    for child_node in tree_root.children:
        if child_node.type == "import_statement" or child_node.type == "import_from_statement":
            parts = child_node.text.decode("utf-8").split()

            # Initialize both names so neither can be referenced unbound if the
            # statement text is not shaped as expected.
            module_name = None
            function_name = None

            if parts[0] == "import":
                module_name = parts[1]
            elif parts[0] == "from":
                module_name = parts[1]
                function_name = parts[3]

                # "from m import x as y" — keep the original imported name.
                if " as " in function_name:
                    function_name = function_name.split(" as ")[0]

            if module_name is None:
                # Defensive: tree-sitter should only hand us import/from nodes here.
                continue

            # "import m as alias" — keep the real module name.
            if " as " in module_name:
                module_name = module_name.split(" as ")[0]

            if function_name and "import " + function_name not in existing_nodes:
                existing_nodes["import " + function_name] = ImportStatement(
                    name=function_name,
                    module=module_name,
                    start_point=child_node.start_point,
                    end_point=child_node.end_point,
                    file_path=script_path,
                    source_code=child_node.text,
                )

            if function_name:
                yield existing_nodes["import " + function_name]

            if module_name not in existing_nodes:
                existing_nodes[module_name] = ImportStatement(
                    name=module_name,
                    module=module_name,
                    start_point=child_node.start_point,
                    end_point=child_node.end_point,
                    file_path=script_path,
                    source_code=child_node.text,
                )

            yield existing_nodes[module_name]

        if child_node.type == "function_definition":
            function_node = find_node(child_node.children, lambda node: node.type == "identifier")
            # BUGFIX: Node.text is bytes; decode it so function names use the
            # same str keys/names as the import statements above.
            function_node_name = function_node.text.decode("utf-8")

            if function_node_name not in existing_nodes:
                existing_nodes[function_node_name] = FunctionDefinition(
                    name=function_node_name,
                    start_point=child_node.start_point,
                    end_point=child_node.end_point,
                    file_path=script_path,
                    source_code=child_node.text,
                )

            yield existing_nodes[function_node_name]

        if child_node.type == "class_definition":
            class_name_node = find_node(child_node.children, lambda node: node.type == "identifier")
            class_name = class_name_node.text.decode("utf-8")

            if class_name not in existing_nodes:
                existing_nodes[class_name] = ClassDefinition(
                    name=class_name,
                    start_point=child_node.start_point,
                    end_point=child_node.end_point,
                    file_path=script_path,
                    source_code=child_node.text,
                )

            yield existing_nodes[class_name]
@@ -0,0 +1,158 @@
1
+ import os
2
+
3
+
4
async def get_non_py_files(repo_path):
    """
    Get files that are not .py files and their contents.

    Check if the specified repository path exists and, if so, traverse the
    directory, collecting the paths of files that do not have a .py extension
    and meet the criteria set in the allowed and ignored patterns.

    Parameters:
    -----------

    - repo_path: The file system path to the repository to scan for non-Python files.

    Returns:
    --------

    A list of file paths that are not Python files and meet the specified
    criteria; an empty list when the repository path does not exist.
    """
    if not os.path.exists(repo_path):
        # BUGFIX: previously returned `{}` (a dict) here while the success path
        # returns a list — callers iterating the result got an inconsistent type.
        return []

    # Path fragments that disqualify a file wherever they appear in its path.
    IGNORED_PATTERNS = {
        ".git",
        "__pycache__",
        "*.pyc",
        "*.pyo",
        "*.pyd",
        "node_modules",
        "*.egg-info",
    }

    # Extensions worth ingesting: docs, config, data, and non-Python source.
    # (Duplicated entries from the original literal were removed; sets make
    # them redundant anyway.)
    ALLOWED_EXTENSIONS = {
        ".txt",
        ".md",
        ".csv",
        ".json",
        ".xml",
        ".yaml",
        ".yml",
        ".html",
        ".css",
        ".js",
        ".ts",
        ".jsx",
        ".tsx",
        ".sql",
        ".log",
        ".ini",
        ".toml",
        ".properties",
        ".sh",
        ".bash",
        ".dockerfile",
        ".gitignore",
        ".gitattributes",
        ".makefile",
        ".pyproject",
        ".requirements",
        ".env",
        ".pdf",
        ".doc",
        ".docx",
        ".dot",
        ".dotx",
        ".rtf",
        ".wps",
        ".wpd",
        ".odt",
        ".ott",
        ".ottx",
        ".wp",
        ".sdw",
        ".sdx",
        ".docm",
        ".dotm",
        # Additional extensions for other programming languages
        ".java",
        ".c",
        ".cpp",
        ".h",
        ".cs",
        ".go",
        ".php",
        ".rb",
        ".swift",
        ".pl",
        ".lua",
        ".rs",
        ".scala",
        ".kt",
        ".v",
        ".asm",
        ".pas",
        ".d",
        ".ml",
        ".clj",
        ".cljs",
        ".erl",
        ".ex",
        ".exs",
        ".f",
        ".fs",
        ".r",
        ".pyi",
        ".pdb",
        ".ipynb",
        ".rmd",
        ".cabal",
        ".hs",
        ".nim",
        ".vhdl",
        ".verilog",
        ".svelte",
        ".scss",
        ".less",
        ".json5",
    }

    def should_process(path):
        """
        Determine if a file should be processed based on its extension and path patterns.

        Returns True when the file's extension is allowed and none of the
        ignored patterns appear anywhere in the path.
        """
        _, ext = os.path.splitext(path)
        return ext in ALLOWED_EXTENSIONS and not any(
            pattern in path for pattern in IGNORED_PATTERNS
        )

    non_py_files_paths = [
        os.path.join(root, file)
        for root, _, files in os.walk(repo_path)
        for file in files
        if not file.endswith(".py") and should_process(os.path.join(root, file))
    ]
    return non_py_files_paths