khoj 2.0.0b12.dev5__py3-none-any.whl → 2.0.0b13.dev5__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (71)
  1. khoj/configure.py +21 -54
  2. khoj/database/adapters/__init__.py +2 -11
  3. khoj/database/migrations/0092_alter_chatmodel_model_type_alter_chatmodel_name_and_more.py +36 -0
  4. khoj/database/migrations/0093_remove_localorgconfig_user_and_more.py +36 -0
  5. khoj/database/models/__init__.py +4 -34
  6. khoj/interface/compiled/404/index.html +2 -2
  7. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +1 -0
  8. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +1 -0
  9. khoj/interface/compiled/_next/static/chunks/{webpack-338a5000c912cc94.js → webpack-ee14d29b64c5ab47.js} +1 -1
  10. khoj/interface/compiled/_next/static/css/{c34713c98384ee87.css → 2945c4a857922f3b.css} +1 -1
  11. khoj/interface/compiled/agents/index.html +2 -2
  12. khoj/interface/compiled/agents/index.txt +2 -2
  13. khoj/interface/compiled/automations/index.html +2 -2
  14. khoj/interface/compiled/automations/index.txt +3 -3
  15. khoj/interface/compiled/chat/index.html +2 -2
  16. khoj/interface/compiled/chat/index.txt +2 -2
  17. khoj/interface/compiled/index.html +2 -2
  18. khoj/interface/compiled/index.txt +2 -2
  19. khoj/interface/compiled/search/index.html +2 -2
  20. khoj/interface/compiled/search/index.txt +2 -2
  21. khoj/interface/compiled/settings/index.html +2 -2
  22. khoj/interface/compiled/settings/index.txt +4 -4
  23. khoj/interface/compiled/share/chat/index.html +2 -2
  24. khoj/interface/compiled/share/chat/index.txt +2 -2
  25. khoj/main.py +4 -6
  26. khoj/processor/content/github/github_to_entries.py +0 -1
  27. khoj/processor/content/notion/notion_to_entries.py +0 -1
  28. khoj/processor/content/text_to_entries.py +0 -1
  29. khoj/processor/conversation/prompts.py +0 -32
  30. khoj/processor/conversation/utils.py +8 -27
  31. khoj/processor/operator/__init__.py +0 -1
  32. khoj/routers/api.py +2 -14
  33. khoj/routers/api_content.py +3 -111
  34. khoj/routers/helpers.py +9 -79
  35. khoj/utils/cli.py +5 -53
  36. khoj/utils/config.py +0 -65
  37. khoj/utils/constants.py +0 -7
  38. khoj/utils/helpers.py +1 -9
  39. khoj/utils/initialization.py +6 -45
  40. khoj/utils/rawconfig.py +0 -67
  41. khoj/utils/state.py +1 -7
  42. khoj/utils/yaml.py +0 -39
  43. {khoj-2.0.0b12.dev5.dist-info → khoj-2.0.0b13.dev5.dist-info}/METADATA +1 -2
  44. {khoj-2.0.0b12.dev5.dist-info → khoj-2.0.0b13.dev5.dist-info}/RECORD +56 -67
  45. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +0 -1
  46. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +0 -1
  47. khoj/migrations/__init__.py +0 -0
  48. khoj/migrations/migrate_offline_chat_default_model.py +0 -69
  49. khoj/migrations/migrate_offline_chat_default_model_2.py +0 -71
  50. khoj/migrations/migrate_offline_chat_schema.py +0 -83
  51. khoj/migrations/migrate_offline_model.py +0 -29
  52. khoj/migrations/migrate_processor_config_openai.py +0 -67
  53. khoj/migrations/migrate_server_pg.py +0 -132
  54. khoj/migrations/migrate_version.py +0 -17
  55. khoj/processor/conversation/offline/__init__.py +0 -0
  56. khoj/processor/conversation/offline/chat_model.py +0 -224
  57. khoj/processor/conversation/offline/utils.py +0 -80
  58. khoj/processor/conversation/offline/whisper.py +0 -15
  59. khoj/utils/fs_syncer.py +0 -252
  60. /khoj/interface/compiled/_next/static/{7GoMcE8WpP9fbfYZXv4Nv → XfWrWDAk5VXeZ88OdP652}/_buildManifest.js +0 -0
  61. /khoj/interface/compiled/_next/static/{7GoMcE8WpP9fbfYZXv4Nv → XfWrWDAk5VXeZ88OdP652}/_ssgManifest.js +0 -0
  62. /khoj/interface/compiled/_next/static/chunks/{1327-1a9107b9a2a04a98.js → 1327-3b1a41af530fa8ee.js} +0 -0
  63. /khoj/interface/compiled/_next/static/chunks/{1915-5c6508f6ebb62a30.js → 1915-fbfe167c84ad60c5.js} +0 -0
  64. /khoj/interface/compiled/_next/static/chunks/{2117-080746c8e170c81a.js → 2117-e78b6902ad6f75ec.js} +0 -0
  65. /khoj/interface/compiled/_next/static/chunks/{2939-4af3fd24b8ffc9ad.js → 2939-4d4084c5b888b960.js} +0 -0
  66. /khoj/interface/compiled/_next/static/chunks/{4447-cd95608f8e93e711.js → 4447-d6cf93724d57e34b.js} +0 -0
  67. /khoj/interface/compiled/_next/static/chunks/{8667-50b03a89e82e0ba7.js → 8667-4b7790573b08c50d.js} +0 -0
  68. /khoj/interface/compiled/_next/static/chunks/{9139-8ac4d9feb10f8869.js → 9139-ce1ae935dac9c871.js} +0 -0
  69. {khoj-2.0.0b12.dev5.dist-info → khoj-2.0.0b13.dev5.dist-info}/WHEEL +0 -0
  70. {khoj-2.0.0b12.dev5.dist-info → khoj-2.0.0b13.dev5.dist-info}/entry_points.txt +0 -0
  71. {khoj-2.0.0b12.dev5.dist-info → khoj-2.0.0b13.dev5.dist-info}/licenses/LICENSE +0 -0
khoj/processor/conversation/offline/chat_model.py DELETED
@@ -1,224 +0,0 @@
- import asyncio
- import logging
- import os
- from datetime import datetime
- from threading import Thread
- from time import perf_counter
- from typing import Any, AsyncGenerator, Dict, List, Union
-
- from langchain_core.messages.chat import ChatMessage
- from llama_cpp import Llama
-
- from khoj.database.models import Agent, ChatMessageModel, ChatModel
- from khoj.processor.conversation import prompts
- from khoj.processor.conversation.offline.utils import download_model
- from khoj.processor.conversation.utils import (
-     ResponseWithThought,
-     commit_conversation_trace,
-     generate_chatml_messages_with_context,
-     messages_to_print,
- )
- from khoj.utils import state
- from khoj.utils.helpers import (
-     is_none_or_empty,
-     is_promptrace_enabled,
-     truncate_code_context,
- )
- from khoj.utils.rawconfig import FileAttachment, LocationData
- from khoj.utils.yaml import yaml_dump
-
- logger = logging.getLogger(__name__)
-
-
- async def converse_offline(
-     # Query
-     user_query: str,
-     # Context
-     references: list[dict] = [],
-     online_results={},
-     code_results={},
-     query_files: str = None,
-     generated_files: List[FileAttachment] = None,
-     additional_context: List[str] = None,
-     generated_asset_results: Dict[str, Dict] = {},
-     location_data: LocationData = None,
-     user_name: str = None,
-     chat_history: list[ChatMessageModel] = [],
-     # Model
-     model_name: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
-     loaded_model: Union[Any, None] = None,
-     max_prompt_size=None,
-     tokenizer_name=None,
-     agent: Agent = None,
-     tracer: dict = {},
- ) -> AsyncGenerator[ResponseWithThought, None]:
-     """
-     Converse with user using Llama (Async Version)
-     """
-     # Initialize Variables
-     assert loaded_model is None or isinstance(loaded_model, Llama), "loaded_model must be of type Llama, if configured"
-     offline_chat_model = loaded_model or download_model(model_name, max_tokens=max_prompt_size)
-     tracer["chat_model"] = model_name
-     current_date = datetime.now()
-
-     if agent and agent.personality:
-         system_prompt = prompts.custom_system_prompt_offline_chat.format(
-             name=agent.name,
-             bio=agent.personality,
-             current_date=current_date.strftime("%Y-%m-%d"),
-             day_of_week=current_date.strftime("%A"),
-         )
-     else:
-         system_prompt = prompts.system_prompt_offline_chat.format(
-             current_date=current_date.strftime("%Y-%m-%d"),
-             day_of_week=current_date.strftime("%A"),
-         )
-
-     if location_data:
-         location_prompt = prompts.user_location.format(location=f"{location_data}")
-         system_prompt = f"{system_prompt}\n{location_prompt}"
-
-     if user_name:
-         user_name_prompt = prompts.user_name.format(name=user_name)
-         system_prompt = f"{system_prompt}\n{user_name_prompt}"
-
-     # Get Conversation Primer appropriate to Conversation Type
-     context_message = ""
-     if not is_none_or_empty(references):
-         context_message = f"{prompts.notes_conversation_offline.format(references=yaml_dump(references))}\n\n"
-     if not is_none_or_empty(online_results):
-         simplified_online_results = online_results.copy()
-         for result in online_results:
-             if online_results[result].get("webpages"):
-                 simplified_online_results[result] = online_results[result]["webpages"]
-
-         context_message += f"{prompts.online_search_conversation_offline.format(online_results=yaml_dump(simplified_online_results))}\n\n"
-     if not is_none_or_empty(code_results):
-         context_message += (
-             f"{prompts.code_executed_context.format(code_results=truncate_code_context(code_results))}\n\n"
-         )
-     context_message = context_message.strip()
-
-     # Setup Prompt with Primer or Conversation History
-     messages = generate_chatml_messages_with_context(
-         user_query,
-         system_prompt,
-         chat_history,
-         context_message=context_message,
-         model_name=model_name,
-         loaded_model=offline_chat_model,
-         max_prompt_size=max_prompt_size,
-         tokenizer_name=tokenizer_name,
-         model_type=ChatModel.ModelType.OFFLINE,
-         query_files=query_files,
-         generated_files=generated_files,
-         generated_asset_results=generated_asset_results,
-         program_execution_context=additional_context,
-     )
-
-     logger.debug(f"Conversation Context for {model_name}: {messages_to_print(messages)}")
-
-     # Use asyncio.Queue and a thread to bridge sync iterator
-     queue: asyncio.Queue[ResponseWithThought] = asyncio.Queue()
-     stop_phrases = ["<s>", "INST]", "Notes:"]
-
-     def _sync_llm_thread():
-         """Synchronous function to run in a separate thread."""
-         aggregated_response = ""
-         start_time = perf_counter()
-         state.chat_lock.acquire()
-         try:
-             response_iterator = send_message_to_model_offline(
-                 messages,
-                 loaded_model=offline_chat_model,
-                 stop=stop_phrases,
-                 max_prompt_size=max_prompt_size,
-                 streaming=True,
-                 tracer=tracer,
-             )
-             for response in response_iterator:
-                 response_delta: str = response["choices"][0]["delta"].get("content", "")
-                 # Log the time taken to start response
-                 if aggregated_response == "" and response_delta != "":
-                     logger.info(f"First response took: {perf_counter() - start_time:.3f} seconds")
-                 # Handle response chunk
-                 aggregated_response += response_delta
-                 # Put chunk into the asyncio queue (non-blocking)
-                 try:
-                     queue.put_nowait(ResponseWithThought(text=response_delta))
-                 except asyncio.QueueFull:
-                     # Should not happen with default queue size unless consumer is very slow
-                     logger.warning("Asyncio queue full during offline LLM streaming.")
-                     # Potentially block here or handle differently if needed
-                     asyncio.run(queue.put(ResponseWithThought(text=response_delta)))
-
-             # Log the time taken to stream the entire response
-             logger.info(f"Chat streaming took: {perf_counter() - start_time:.3f} seconds")
-
-             # Save conversation trace
-             tracer["chat_model"] = model_name
-             if is_promptrace_enabled():
-                 commit_conversation_trace(messages, aggregated_response, tracer)
-
-         except Exception as e:
-             logger.error(f"Error in offline LLM thread: {e}", exc_info=True)
-         finally:
-             state.chat_lock.release()
-             # Signal end of stream
-             queue.put_nowait(None)
-
-     # Start the synchronous thread
-     thread = Thread(target=_sync_llm_thread)
-     thread.start()
-
-     # Asynchronously consume from the queue
-     while True:
-         chunk = await queue.get()
-         if chunk is None:  # End of stream signal
-             queue.task_done()
-             break
-         yield chunk
-         queue.task_done()
-
-     # Wait for the thread to finish (optional, ensures cleanup)
-     loop = asyncio.get_running_loop()
-     await loop.run_in_executor(None, thread.join)
-
-
- def send_message_to_model_offline(
-     messages: List[ChatMessage],
-     loaded_model=None,
-     model_name="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
-     temperature: float = 0.2,
-     streaming=False,
-     stop=[],
-     max_prompt_size: int = None,
-     response_type: str = "text",
-     tracer: dict = {},
- ):
-     assert loaded_model is None or isinstance(loaded_model, Llama), "loaded_model must be of type Llama, if configured"
-     offline_chat_model = loaded_model or download_model(model_name, max_tokens=max_prompt_size)
-     messages_dict = [{"role": message.role, "content": message.content} for message in messages]
-     seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
-     response = offline_chat_model.create_chat_completion(
-         messages_dict,
-         stop=stop,
-         stream=streaming,
-         temperature=temperature,
-         response_format={"type": response_type},
-         seed=seed,
-     )
-
-     if streaming:
-         return response
-
-     response_text: str = response["choices"][0]["message"].get("content", "")
-
-     # Save conversation trace for non-streaming responses
-     # Streamed responses need to be saved by the calling function
-     tracer["chat_model"] = model_name
-     tracer["temperature"] = temperature
-     if is_promptrace_enabled():
-         commit_conversation_trace(messages, response_text, tracer)
-
-     return ResponseWithThought(text=response_text)
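Note: the removed converse_offline bridged llama-cpp's blocking token iterator into an async generator by pumping chunks from a worker thread into an asyncio.Queue, with a None sentinel marking end of stream. A minimal, self-contained sketch of that pattern follows; the token source is a stand-in for the model, and it uses loop.call_soon_threadsafe to hand chunks to the event loop, since plain asyncio.Queue calls from another thread (as in the removed code) are not guaranteed thread-safe.

```python
import asyncio
from threading import Thread
from typing import AsyncGenerator, Iterator


def sync_token_stream() -> Iterator[str]:
    """Stand-in for llama-cpp's blocking streaming iterator."""
    yield from ["Offline ", "chat ", "was ", "removed."]


async def stream_async() -> AsyncGenerator[str, None]:
    """Bridge a blocking iterator into an async generator via a queue."""
    queue: asyncio.Queue = asyncio.Queue()
    loop = asyncio.get_running_loop()

    def producer() -> None:
        for chunk in sync_token_stream():
            # Hand each chunk to the event loop thread-safely.
            loop.call_soon_threadsafe(queue.put_nowait, chunk)
        loop.call_soon_threadsafe(queue.put_nowait, None)  # end-of-stream sentinel

    thread = Thread(target=producer)
    thread.start()
    while True:
        chunk = await queue.get()
        if chunk is None:
            break
        yield chunk
    # Join the worker without blocking the event loop.
    await loop.run_in_executor(None, thread.join)


async def main() -> None:
    async for chunk in stream_async():
        print(chunk, end="", flush=True)


if __name__ == "__main__":
    asyncio.run(main())
```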
khoj/processor/conversation/offline/utils.py DELETED
@@ -1,80 +0,0 @@
- import glob
- import logging
- import math
- import os
- from typing import Any, Dict
-
- from huggingface_hub.constants import HF_HUB_CACHE
-
- from khoj.utils import state
- from khoj.utils.helpers import get_device_memory
-
- logger = logging.getLogger(__name__)
-
-
- def download_model(repo_id: str, filename: str = "*Q4_K_M.gguf", max_tokens: int = None):
-     # Initialize Model Parameters
-     # Use n_ctx=0 to get context size from the model
-     kwargs: Dict[str, Any] = {"n_threads": 4, "n_ctx": 0, "verbose": False}
-
-     # Decide whether to load model to GPU or CPU
-     device = "gpu" if state.chat_on_gpu and state.device != "cpu" else "cpu"
-     kwargs["n_gpu_layers"] = -1 if device == "gpu" else 0
-
-     # Add chat format if known
-     if "llama-3" in repo_id.lower():
-         kwargs["chat_format"] = "llama-3"
-     elif "gemma-2" in repo_id.lower():
-         kwargs["chat_format"] = "gemma"
-
-     # Check if the model is already downloaded
-     model_path = load_model_from_cache(repo_id, filename)
-     chat_model = None
-     try:
-         chat_model = load_model(model_path, repo_id, filename, kwargs)
-     except:
-         # Load model on CPU if GPU is not available
-         kwargs["n_gpu_layers"], device = 0, "cpu"
-         chat_model = load_model(model_path, repo_id, filename, kwargs)
-
-     # Now load the model with context size set based on:
-     # 1. context size supported by model and
-     # 2. configured size or machine (V)RAM
-     kwargs["n_ctx"] = infer_max_tokens(chat_model.n_ctx(), max_tokens)
-     chat_model = load_model(model_path, repo_id, filename, kwargs)
-
-     logger.debug(
-         f"{'Loaded' if model_path else 'Downloaded'} chat model to {device.upper()} with {kwargs['n_ctx']} token context window."
-     )
-     return chat_model
-
-
- def load_model(model_path: str, repo_id: str, filename: str = "*Q4_K_M.gguf", kwargs: dict = {}):
-     from llama_cpp.llama import Llama
-
-     if model_path:
-         return Llama(model_path, **kwargs)
-     else:
-         return Llama.from_pretrained(repo_id=repo_id, filename=filename, **kwargs)
-
-
- def load_model_from_cache(repo_id: str, filename: str, repo_type="models"):
-     # Construct the path to the model file in the cache directory
-     repo_org, repo_name = repo_id.split("/")
-     object_id = "--".join([repo_type, repo_org, repo_name])
-     model_path = os.path.sep.join([HF_HUB_CACHE, object_id, "snapshots", "**", filename])
-
-     # Check if the model file exists
-     paths = glob.glob(model_path)
-     if paths:
-         return paths[0]
-     else:
-         return None
-
-
- def infer_max_tokens(model_context_window: int, configured_max_tokens=None) -> int:
-     """Infer max prompt size based on device memory and max context window supported by the model"""
-     configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens
-     vram_based_n_ctx = int(get_device_memory() / 1e6)  # based on heuristic
-     configured_max_tokens = configured_max_tokens or math.inf  # do not use if set to None
-     return min(configured_max_tokens, vram_based_n_ctx, model_context_window)
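Note: the removed infer_max_tokens bounded the llama-cpp context size by three limits: the configured maximum, a heuristic of roughly one token per megabyte of device memory, and the model's own context window. A standalone sketch of that calculation, with device_memory_bytes passed in directly in place of the removed get_device_memory helper:

```python
import math


def infer_max_tokens(model_context_window: int, device_memory_bytes: int, configured_max_tokens=None) -> int:
    """Bound the context size by config, device (V)RAM, and the model's window."""
    configured = configured_max_tokens if configured_max_tokens else math.inf
    vram_based_n_ctx = int(device_memory_bytes / 1e6)  # heuristic: ~1 token per MB
    return min(configured, vram_based_n_ctx, model_context_window)


# E.g. a 131072-token model on a ~16 GB device with no configured cap -> 16000 tokens.
print(infer_max_tokens(131072, int(16e9)))
```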
khoj/processor/conversation/offline/whisper.py DELETED
@@ -1,15 +0,0 @@
- import whisper
- from asgiref.sync import sync_to_async
-
- from khoj.utils import state
-
-
- async def transcribe_audio_offline(audio_filename: str, model: str) -> str:
-     """
-     Transcribe audio file offline using Whisper
-     """
-     # Send the audio data to the Whisper API
-     if not state.whisper_model:
-         state.whisper_model = whisper.load_model(model)
-     response = await sync_to_async(state.whisper_model.transcribe)(audio_filename)
-     return response["text"]
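Note: the removed whisper.py lazily loaded a single Whisper model into global state and wrapped the blocking transcribe call with asgiref's sync_to_async so it would not stall the event loop. A minimal sketch of the same lazy-load-and-cache pattern; the module-level _model_cache and the audio path in the usage line are illustrative, not Khoj's API.

```python
import asyncio

import whisper  # openai-whisper
from asgiref.sync import sync_to_async

_model_cache: dict = {}


async def transcribe_audio(audio_filename: str, model_name: str = "base") -> str:
    # Load the Whisper model once and reuse it across calls.
    if model_name not in _model_cache:
        _model_cache[model_name] = whisper.load_model(model_name)
    # Run the blocking transcription in a worker thread.
    response = await sync_to_async(_model_cache[model_name].transcribe)(audio_filename)
    return response["text"]


# Usage: asyncio.run(transcribe_audio("/tmp/voice_note.wav"))
```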
khoj/utils/fs_syncer.py DELETED
@@ -1,252 +0,0 @@
- import glob
- import logging
- import os
- from pathlib import Path
- from typing import Optional
-
- from bs4 import BeautifulSoup
- from magika import Magika
-
- from khoj.database.models import (
-     KhojUser,
-     LocalMarkdownConfig,
-     LocalOrgConfig,
-     LocalPdfConfig,
-     LocalPlaintextConfig,
- )
- from khoj.utils.config import SearchType
- from khoj.utils.helpers import get_absolute_path, is_none_or_empty
- from khoj.utils.rawconfig import TextContentConfig
-
- logger = logging.getLogger(__name__)
- magika = Magika()
-
-
- def collect_files(user: KhojUser, search_type: Optional[SearchType] = SearchType.All) -> dict:
-     files: dict[str, dict] = {"docx": {}, "image": {}}
-
-     if search_type == SearchType.All or search_type == SearchType.Org:
-         org_config = LocalOrgConfig.objects.filter(user=user).first()
-         files["org"] = get_org_files(construct_config_from_db(org_config)) if org_config else {}
-     if search_type == SearchType.All or search_type == SearchType.Markdown:
-         markdown_config = LocalMarkdownConfig.objects.filter(user=user).first()
-         files["markdown"] = get_markdown_files(construct_config_from_db(markdown_config)) if markdown_config else {}
-     if search_type == SearchType.All or search_type == SearchType.Plaintext:
-         plaintext_config = LocalPlaintextConfig.objects.filter(user=user).first()
-         files["plaintext"] = get_plaintext_files(construct_config_from_db(plaintext_config)) if plaintext_config else {}
-     if search_type == SearchType.All or search_type == SearchType.Pdf:
-         pdf_config = LocalPdfConfig.objects.filter(user=user).first()
-         files["pdf"] = get_pdf_files(construct_config_from_db(pdf_config)) if pdf_config else {}
-     files["image"] = {}
-     files["docx"] = {}
-     return files
-
-
- def construct_config_from_db(db_config) -> TextContentConfig:
-     return TextContentConfig(
-         input_files=db_config.input_files,
-         input_filter=db_config.input_filter,
-         index_heading_entries=db_config.index_heading_entries,
-     )
-
-
- def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
-     def is_plaintextfile(file: str):
-         "Check if file is plaintext file"
-         # Check if file path exists
-         content_group = magika.identify_path(Path(file)).output.group
-         # Use file extension to decide plaintext if file content is not identifiable
-         valid_text_file_extensions = ("txt", "md", "markdown", "org" "mbox", "rst", "html", "htm", "xml")
-         return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"]
-
-     def extract_html_content(html_content: str):
-         "Extract content from HTML"
-         soup = BeautifulSoup(html_content, "html.parser")
-         return soup.get_text(strip=True, separator="\n")
-
-     # Extract required fields from config
-     input_files, input_filters = (
-         config.input_files,
-         config.input_filter,
-     )
-
-     # Input Validation
-     if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
-         logger.debug("At least one of input-files or input-file-filter is required to be specified")
-         return {}
-
-     # Get all plain text files to process
-     absolute_plaintext_files, filtered_plaintext_files = set(), set()
-     if input_files:
-         absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files}
-     if input_filters:
-         filtered_plaintext_files = {
-             filtered_file
-             for plaintext_file_filter in input_filters
-             for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
-             if os.path.isfile(filtered_file)
-         }
-
-     all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)
-
-     files_with_no_plaintext_extensions = {
-         target_files for target_files in all_target_files if not is_plaintextfile(target_files)
-     }
-     if any(files_with_no_plaintext_extensions):
-         logger.warning(f"Skipping unsupported files from plaintext indexing: {files_with_no_plaintext_extensions}")
-         all_target_files = list(set(all_target_files) - files_with_no_plaintext_extensions)
-
-     logger.debug(f"Processing files: {all_target_files}")
-
-     filename_to_content_map = {}
-     for file in all_target_files:
-         with open(file, "r", encoding="utf8") as f:
-             try:
-                 plaintext_content = f.read()
-                 if file.endswith(("html", "htm", "xml")):
-                     plaintext_content = extract_html_content(plaintext_content)
-                 filename_to_content_map[file] = plaintext_content
-             except Exception as e:
-                 logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
-                 logger.warning(e, exc_info=True)
-
-     return filename_to_content_map
-
-
- def get_org_files(config: TextContentConfig):
-     # Extract required fields from config
-     org_files, org_file_filters = (
-         config.input_files,
-         config.input_filter,
-     )
-
-     # Input Validation
-     if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters):
-         logger.debug("At least one of org-files or org-file-filter is required to be specified")
-         return {}
-
-     # Get Org files to process
-     absolute_org_files, filtered_org_files = set(), set()
-     if org_files:
-         absolute_org_files = {get_absolute_path(org_file) for org_file in org_files}
-     if org_file_filters:
-         filtered_org_files = {
-             filtered_file
-             for org_file_filter in org_file_filters
-             for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True)
-             if os.path.isfile(filtered_file)
-         }
-
-     all_org_files = sorted(absolute_org_files | filtered_org_files)
-
-     files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
-     if any(files_with_non_org_extensions):
-         logger.warning(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}")
-
-     logger.debug(f"Processing files: {all_org_files}")
-
-     filename_to_content_map = {}
-     for file in all_org_files:
-         with open(file, "r", encoding="utf8") as f:
-             try:
-                 filename_to_content_map[file] = f.read()
-             except Exception as e:
-                 logger.warning(f"Unable to read file: {file} as org. Skipping file.")
-                 logger.warning(e, exc_info=True)
-
-     return filename_to_content_map
-
-
- def get_markdown_files(config: TextContentConfig):
-     # Extract required fields from config
-     markdown_files, markdown_file_filters = (
-         config.input_files,
-         config.input_filter,
-     )
-
-     # Input Validation
-     if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters):
-         logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
-         return {}
-
-     # Get markdown files to process
-     absolute_markdown_files, filtered_markdown_files = set(), set()
-     if markdown_files:
-         absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
-
-     if markdown_file_filters:
-         filtered_markdown_files = {
-             filtered_file
-             for markdown_file_filter in markdown_file_filters
-             for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True)
-             if os.path.isfile(filtered_file)
-         }
-
-     all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
-
-     files_with_non_markdown_extensions = {
-         md_file for md_file in all_markdown_files if not md_file.endswith(".md") and not md_file.endswith(".markdown")
-     }
-
-     if any(files_with_non_markdown_extensions):
-         logger.warning(
-             f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}"
-         )
-
-     logger.debug(f"Processing files: {all_markdown_files}")
-
-     filename_to_content_map = {}
-     for file in all_markdown_files:
-         with open(file, "r", encoding="utf8") as f:
-             try:
-                 filename_to_content_map[file] = f.read()
-             except Exception as e:
-                 logger.warning(f"Unable to read file: {file} as markdown. Skipping file.")
-                 logger.warning(e, exc_info=True)
-
-     return filename_to_content_map
-
-
- def get_pdf_files(config: TextContentConfig):
-     # Extract required fields from config
-     pdf_files, pdf_file_filters = (
-         config.input_files,
-         config.input_filter,
-     )
-
-     # Input Validation
-     if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters):
-         logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
-         return {}
-
-     # Get PDF files to process
-     absolute_pdf_files, filtered_pdf_files = set(), set()
-     if pdf_files:
-         absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
-     if pdf_file_filters:
-         filtered_pdf_files = {
-             filtered_file
-             for pdf_file_filter in pdf_file_filters
-             for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
-             if os.path.isfile(filtered_file)
-         }
-
-     all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)
-
-     files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")}
-
-     if any(files_with_non_pdf_extensions):
-         logger.warning(f"[Warning] There maybe non pdf-mode files in the input set: {files_with_non_pdf_extensions}")
-
-     logger.debug(f"Processing files: {all_pdf_files}")
-
-     filename_to_content_map = {}
-     for file in all_pdf_files:
-         with open(file, "rb") as f:
-             try:
-                 filename_to_content_map[file] = f.read()
-             except Exception as e:
-                 logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
-                 logger.warning(e, exc_info=True)
-
-     return filename_to_content_map
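Note: all four get_*_files helpers above share one shape: union explicit input_files with recursive glob matches from input_filter, warn about unexpected extensions, then read each file into a path-to-content map. (Incidentally, the valid_text_file_extensions tuple in the removed get_plaintext_files is missing a comma, so "org" "mbox" concatenates into the single extension "orgmbox".) A condensed sketch of the shared pattern; collect_text_files and its parameters are illustrative names, not the removed API.

```python
import glob
import os


def collect_text_files(input_files=None, input_filters=None) -> dict[str, str]:
    """Union explicit paths with recursive glob matches, then read each file."""
    paths = {os.path.abspath(os.path.expanduser(f)) for f in (input_files or [])}
    for pattern in input_filters or []:
        paths |= {
            match
            for match in glob.glob(os.path.expanduser(pattern), recursive=True)
            if os.path.isfile(match)
        }

    contents: dict[str, str] = {}
    for path in sorted(paths):
        try:
            with open(path, "r", encoding="utf8") as f:
                contents[path] = f.read()
        except Exception:
            continue  # the removed code logged and skipped unreadable files
    return contents


# Usage: collect_text_files(input_filters=["~/notes/**/*.org"])
```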