khoj-1.16.1.dev15-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- khoj/__init__.py +0 -0
- khoj/app/README.md +94 -0
- khoj/app/__init__.py +0 -0
- khoj/app/asgi.py +16 -0
- khoj/app/settings.py +192 -0
- khoj/app/urls.py +25 -0
- khoj/configure.py +424 -0
- khoj/database/__init__.py +0 -0
- khoj/database/adapters/__init__.py +1234 -0
- khoj/database/admin.py +290 -0
- khoj/database/apps.py +6 -0
- khoj/database/management/__init__.py +0 -0
- khoj/database/management/commands/__init__.py +0 -0
- khoj/database/management/commands/change_generated_images_url.py +61 -0
- khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
- khoj/database/migrations/0001_khojuser.py +98 -0
- khoj/database/migrations/0002_googleuser.py +32 -0
- khoj/database/migrations/0003_vector_extension.py +10 -0
- khoj/database/migrations/0004_content_types_and_more.py +181 -0
- khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
- khoj/database/migrations/0006_embeddingsdates.py +33 -0
- khoj/database/migrations/0007_add_conversation.py +27 -0
- khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
- khoj/database/migrations/0009_khojapiuser.py +24 -0
- khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
- khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
- khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
- khoj/database/migrations/0012_entry_file_source.py +21 -0
- khoj/database/migrations/0013_subscription.py +37 -0
- khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
- khoj/database/migrations/0015_alter_subscription_user.py +21 -0
- khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
- khoj/database/migrations/0017_searchmodel.py +32 -0
- khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
- khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
- khoj/database/migrations/0020_reflectivequestion.py +36 -0
- khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
- khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
- khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
- khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
- khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
- khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
- khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
- khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
- khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
- khoj/database/migrations/0029_userrequests.py +27 -0
- khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
- khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
- khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
- khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
- khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
- khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
- khoj/database/migrations/0035_processlock.py +26 -0
- khoj/database/migrations/0036_alter_processlock_name.py +19 -0
- khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
- khoj/database/migrations/0036_publicconversation.py +42 -0
- khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
- khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
- khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
- khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
- khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
- khoj/database/migrations/0040_alter_processlock_name.py +26 -0
- khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
- khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
- khoj/database/migrations/0042_serverchatsettings.py +46 -0
- khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
- khoj/database/migrations/0044_conversation_file_filters.py +17 -0
- khoj/database/migrations/0045_fileobject.py +37 -0
- khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
- khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
- khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
- khoj/database/migrations/0049_datastore.py +38 -0
- khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
- khoj/database/migrations/0050_alter_processlock_name.py +25 -0
- khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
- khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
- khoj/database/migrations/__init__.py +0 -0
- khoj/database/models/__init__.py +402 -0
- khoj/database/tests.py +3 -0
- khoj/interface/email/feedback.html +34 -0
- khoj/interface/email/magic_link.html +17 -0
- khoj/interface/email/task.html +40 -0
- khoj/interface/email/welcome.html +61 -0
- khoj/interface/web/404.html +56 -0
- khoj/interface/web/agent.html +312 -0
- khoj/interface/web/agents.html +276 -0
- khoj/interface/web/assets/icons/agents.svg +6 -0
- khoj/interface/web/assets/icons/automation.svg +37 -0
- khoj/interface/web/assets/icons/cancel.svg +3 -0
- khoj/interface/web/assets/icons/chat.svg +24 -0
- khoj/interface/web/assets/icons/collapse.svg +17 -0
- khoj/interface/web/assets/icons/computer.png +0 -0
- khoj/interface/web/assets/icons/confirm-icon.svg +1 -0
- khoj/interface/web/assets/icons/copy-button-success.svg +6 -0
- khoj/interface/web/assets/icons/copy-button.svg +5 -0
- khoj/interface/web/assets/icons/credit-card.png +0 -0
- khoj/interface/web/assets/icons/delete.svg +26 -0
- khoj/interface/web/assets/icons/docx.svg +7 -0
- khoj/interface/web/assets/icons/edit.svg +4 -0
- khoj/interface/web/assets/icons/favicon-128x128.ico +0 -0
- khoj/interface/web/assets/icons/favicon-128x128.png +0 -0
- khoj/interface/web/assets/icons/favicon-256x256.png +0 -0
- khoj/interface/web/assets/icons/favicon.icns +0 -0
- khoj/interface/web/assets/icons/github.svg +1 -0
- khoj/interface/web/assets/icons/key.svg +4 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
- khoj/interface/web/assets/icons/khoj-logo-sideways.svg +5385 -0
- khoj/interface/web/assets/icons/logotype.svg +1 -0
- khoj/interface/web/assets/icons/markdown.svg +1 -0
- khoj/interface/web/assets/icons/new.svg +23 -0
- khoj/interface/web/assets/icons/notion.svg +4 -0
- khoj/interface/web/assets/icons/openai-logomark.svg +1 -0
- khoj/interface/web/assets/icons/org.svg +1 -0
- khoj/interface/web/assets/icons/pdf.svg +23 -0
- khoj/interface/web/assets/icons/pencil-edit.svg +5 -0
- khoj/interface/web/assets/icons/plaintext.svg +1 -0
- khoj/interface/web/assets/icons/question-mark-icon.svg +1 -0
- khoj/interface/web/assets/icons/search.svg +25 -0
- khoj/interface/web/assets/icons/send.svg +1 -0
- khoj/interface/web/assets/icons/share.svg +8 -0
- khoj/interface/web/assets/icons/speaker.svg +4 -0
- khoj/interface/web/assets/icons/stop-solid.svg +37 -0
- khoj/interface/web/assets/icons/sync.svg +4 -0
- khoj/interface/web/assets/icons/thumbs-down-svgrepo-com.svg +6 -0
- khoj/interface/web/assets/icons/thumbs-up-svgrepo-com.svg +6 -0
- khoj/interface/web/assets/icons/user-silhouette.svg +4 -0
- khoj/interface/web/assets/icons/voice.svg +8 -0
- khoj/interface/web/assets/icons/web.svg +2 -0
- khoj/interface/web/assets/icons/whatsapp.svg +17 -0
- khoj/interface/web/assets/khoj.css +237 -0
- khoj/interface/web/assets/markdown-it.min.js +8476 -0
- khoj/interface/web/assets/natural-cron.min.js +1 -0
- khoj/interface/web/assets/org.min.js +1823 -0
- khoj/interface/web/assets/pico.min.css +5 -0
- khoj/interface/web/assets/purify.min.js +3 -0
- khoj/interface/web/assets/samples/desktop-browse-draw-sample.png +0 -0
- khoj/interface/web/assets/samples/desktop-plain-chat-sample.png +0 -0
- khoj/interface/web/assets/samples/desktop-remember-plan-sample.png +0 -0
- khoj/interface/web/assets/samples/phone-browse-draw-sample.png +0 -0
- khoj/interface/web/assets/samples/phone-plain-chat-sample.png +0 -0
- khoj/interface/web/assets/samples/phone-remember-plan-sample.png +0 -0
- khoj/interface/web/assets/utils.js +33 -0
- khoj/interface/web/base_config.html +445 -0
- khoj/interface/web/chat.html +3546 -0
- khoj/interface/web/config.html +1011 -0
- khoj/interface/web/config_automation.html +1103 -0
- khoj/interface/web/content_source_computer_input.html +139 -0
- khoj/interface/web/content_source_github_input.html +216 -0
- khoj/interface/web/content_source_notion_input.html +94 -0
- khoj/interface/web/khoj.webmanifest +51 -0
- khoj/interface/web/login.html +219 -0
- khoj/interface/web/public_conversation.html +2006 -0
- khoj/interface/web/search.html +470 -0
- khoj/interface/web/utils.html +48 -0
- khoj/main.py +241 -0
- khoj/manage.py +22 -0
- khoj/migrations/__init__.py +0 -0
- khoj/migrations/migrate_offline_chat_default_model.py +69 -0
- khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
- khoj/migrations/migrate_offline_chat_schema.py +83 -0
- khoj/migrations/migrate_offline_model.py +29 -0
- khoj/migrations/migrate_processor_config_openai.py +67 -0
- khoj/migrations/migrate_server_pg.py +138 -0
- khoj/migrations/migrate_version.py +17 -0
- khoj/processor/__init__.py +0 -0
- khoj/processor/content/__init__.py +0 -0
- khoj/processor/content/docx/__init__.py +0 -0
- khoj/processor/content/docx/docx_to_entries.py +110 -0
- khoj/processor/content/github/__init__.py +0 -0
- khoj/processor/content/github/github_to_entries.py +224 -0
- khoj/processor/content/images/__init__.py +0 -0
- khoj/processor/content/images/image_to_entries.py +118 -0
- khoj/processor/content/markdown/__init__.py +0 -0
- khoj/processor/content/markdown/markdown_to_entries.py +165 -0
- khoj/processor/content/notion/notion_to_entries.py +260 -0
- khoj/processor/content/org_mode/__init__.py +0 -0
- khoj/processor/content/org_mode/org_to_entries.py +231 -0
- khoj/processor/content/org_mode/orgnode.py +532 -0
- khoj/processor/content/pdf/__init__.py +0 -0
- khoj/processor/content/pdf/pdf_to_entries.py +116 -0
- khoj/processor/content/plaintext/__init__.py +0 -0
- khoj/processor/content/plaintext/plaintext_to_entries.py +122 -0
- khoj/processor/content/text_to_entries.py +297 -0
- khoj/processor/conversation/__init__.py +0 -0
- khoj/processor/conversation/anthropic/__init__.py +0 -0
- khoj/processor/conversation/anthropic/anthropic_chat.py +206 -0
- khoj/processor/conversation/anthropic/utils.py +114 -0
- khoj/processor/conversation/offline/__init__.py +0 -0
- khoj/processor/conversation/offline/chat_model.py +231 -0
- khoj/processor/conversation/offline/utils.py +78 -0
- khoj/processor/conversation/offline/whisper.py +15 -0
- khoj/processor/conversation/openai/__init__.py +0 -0
- khoj/processor/conversation/openai/gpt.py +187 -0
- khoj/processor/conversation/openai/utils.py +129 -0
- khoj/processor/conversation/openai/whisper.py +13 -0
- khoj/processor/conversation/prompts.py +758 -0
- khoj/processor/conversation/utils.py +262 -0
- khoj/processor/embeddings.py +117 -0
- khoj/processor/speech/__init__.py +0 -0
- khoj/processor/speech/text_to_speech.py +51 -0
- khoj/processor/tools/__init__.py +0 -0
- khoj/processor/tools/online_search.py +225 -0
- khoj/routers/__init__.py +0 -0
- khoj/routers/api.py +626 -0
- khoj/routers/api_agents.py +43 -0
- khoj/routers/api_chat.py +1180 -0
- khoj/routers/api_config.py +434 -0
- khoj/routers/api_phone.py +86 -0
- khoj/routers/auth.py +181 -0
- khoj/routers/email.py +133 -0
- khoj/routers/helpers.py +1188 -0
- khoj/routers/indexer.py +349 -0
- khoj/routers/notion.py +91 -0
- khoj/routers/storage.py +35 -0
- khoj/routers/subscription.py +104 -0
- khoj/routers/twilio.py +36 -0
- khoj/routers/web_client.py +471 -0
- khoj/search_filter/__init__.py +0 -0
- khoj/search_filter/base_filter.py +15 -0
- khoj/search_filter/date_filter.py +217 -0
- khoj/search_filter/file_filter.py +30 -0
- khoj/search_filter/word_filter.py +29 -0
- khoj/search_type/__init__.py +0 -0
- khoj/search_type/text_search.py +241 -0
- khoj/utils/__init__.py +0 -0
- khoj/utils/cli.py +93 -0
- khoj/utils/config.py +81 -0
- khoj/utils/constants.py +24 -0
- khoj/utils/fs_syncer.py +249 -0
- khoj/utils/helpers.py +418 -0
- khoj/utils/initialization.py +146 -0
- khoj/utils/jsonl.py +43 -0
- khoj/utils/models.py +47 -0
- khoj/utils/rawconfig.py +160 -0
- khoj/utils/state.py +46 -0
- khoj/utils/yaml.py +43 -0
- khoj-1.16.1.dev15.dist-info/METADATA +178 -0
- khoj-1.16.1.dev15.dist-info/RECORD +242 -0
- khoj-1.16.1.dev15.dist-info/WHEEL +4 -0
- khoj-1.16.1.dev15.dist-info/entry_points.txt +2 -0
- khoj-1.16.1.dev15.dist-info/licenses/LICENSE +661 -0
khoj/migrations/migrate_server_pg.py
@@ -0,0 +1,138 @@
+"""
+The application config currently looks like this:
+app:
+  should-log-telemetry: true
+content-type:
+  ...
+processor:
+  conversation:
+    conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
+    max-prompt-size: null
+    offline-chat:
+      chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
+      enable-offline-chat: false
+    openai:
+      api-key: sk-blah
+      chat-model: gpt-3.5-turbo
+    tokenizer: null
+search-type:
+  asymmetric:
+    cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
+    encoder: sentence-transformers/multi-qa-MiniLM-L6-cos-v1
+    encoder-type: null
+    model-directory: /Users/si/.khoj/search/asymmetric
+  image:
+    encoder: sentence-transformers/clip-ViT-B-32
+    encoder-type: null
+    model-directory: /Users/si/.khoj/search/image
+  symmetric:
+    cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
+    encoder: sentence-transformers/all-MiniLM-L6-v2
+    encoder-type: null
+    model-directory: ~/.khoj/search/symmetric
+version: 0.14.0
+
+
+The new version will look like this:
+app:
+  should-log-telemetry: true
+processor:
+  conversation:
+    offline-chat:
+      enabled: false
+    openai:
+      api-key: sk-blah
+    chat-model-options:
+      - chat-model: gpt-3.5-turbo
+        tokenizer: null
+        type: openai
+      - chat-model: mistral-7b-instruct-v0.1.Q4_0.gguf
+        tokenizer: null
+        type: offline
+search-type:
+  asymmetric:
+    cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
+    encoder: sentence-transformers/multi-qa-MiniLM-L6-cos-v1
+version: 0.15.0
+"""
+
+import logging
+
+from packaging import version
+
+from khoj.database.models import (
+    ChatModelOptions,
+    OpenAIProcessorConversationConfig,
+    SearchModelConfig,
+)
+from khoj.utils.yaml import load_config_from_file, save_config_to_file
+
+logger = logging.getLogger(__name__)
+
+
+def migrate_server_pg(args):
+    schema_version = "0.15.0"
+    raw_config = load_config_from_file(args.config_file)
+    previous_version = raw_config.get("version")
+
+    if previous_version is None or version.parse(previous_version) < version.parse(schema_version):
+        logger.info(
+            f"Migrating configuration used for version {previous_version} to latest version for server with postgres in {args.version_no}"
+        )
+        raw_config["version"] = schema_version
+
+    if raw_config is None:
+        return args
+
+    if "search-type" in raw_config and raw_config["search-type"]:
+        if "asymmetric" in raw_config["search-type"]:
+            # Delete all existing search models
+            SearchModelConfig.objects.filter(model_type=SearchModelConfig.ModelType.TEXT).delete()
+            # Create new search model from existing Khoj YAML config
+            asymmetric_search = raw_config["search-type"]["asymmetric"]
+            SearchModelConfig.objects.create(
+                name="default",
+                model_type=SearchModelConfig.ModelType.TEXT,
+                bi_encoder=asymmetric_search.get("encoder"),
+                cross_encoder=asymmetric_search.get("cross-encoder"),
+            )
+
+    if "processor" in raw_config and raw_config["processor"] and "conversation" in raw_config["processor"]:
+        processor_conversation = raw_config["processor"]["conversation"]
+
+        if "offline-chat" in raw_config["processor"]["conversation"]:
+            offline_chat = raw_config["processor"]["conversation"]["offline-chat"]
+            ChatModelOptions.objects.create(
+                chat_model=offline_chat.get("chat-model"),
+                tokenizer=processor_conversation.get("tokenizer"),
+                max_prompt_size=processor_conversation.get("max-prompt-size"),
+                model_type=ChatModelOptions.ModelType.OFFLINE,
+            )
+
+        if (
+            "openai" in raw_config["processor"]["conversation"]
+            and raw_config["processor"]["conversation"]["openai"]
+        ):
+            openai = raw_config["processor"]["conversation"]["openai"]
+
+            if openai.get("api-key") is None:
+                logger.error("OpenAI API Key is not set. Will not be migrating OpenAI config.")
+            else:
+                if openai.get("chat-model") is None:
+                    openai["chat-model"] = "gpt-3.5-turbo"
+
+                openai_config = OpenAIProcessorConversationConfig.objects.create(
+                    api_key=openai.get("api-key"), name="default"
+                )
+
+                ChatModelOptions.objects.create(
+                    chat_model=openai.get("chat-model"),
+                    tokenizer=processor_conversation.get("tokenizer"),
+                    max_prompt_size=processor_conversation.get("max-prompt-size"),
+                    model_type=ChatModelOptions.ModelType.OPENAI,
+                    openai_config=openai_config,
+                )
+
+    save_config_to_file(raw_config, args.config_file)
+
+    return args
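The migration above gates on a PEP 440 version comparison via packaging.version rather than a string comparison. A minimal sketch of the same gating logic, using an illustrative in-memory dict in place of the YAML config loaded by load_config_from_file:

    from packaging import version

    schema_version = "0.15.0"
    raw_config = {"version": "0.14.0"}  # illustrative stand-in for the loaded YAML config

    previous_version = raw_config.get("version")
    # version.parse gives semantic ordering, so "0.9.0" < "0.15.0" holds here,
    # unlike a lexicographic string comparison, which would order them the other way.
    if previous_version is None or version.parse(previous_version) < version.parse(schema_version):
        raw_config["version"] = schema_version

    print(raw_config["version"])  # 0.15.0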
khoj/migrations/migrate_version.py
@@ -0,0 +1,17 @@
+from khoj.utils.yaml import load_config_from_file, save_config_to_file
+
+
+def migrate_config_to_version(args):
+    schema_version = "0.9.0"
+    raw_config = load_config_from_file(args.config_file)
+
+    # Add version to khoj config schema
+    if "version" not in raw_config:
+        raw_config["version"] = schema_version
+        save_config_to_file(raw_config, args.config_file)
+
+        # regenerate khoj index on first start of this version
+        # this should refresh index and apply index corruption fixes from #325
+        args.regenerate = True
+
+    return args
khoj/processor/__init__.py: File without changes
khoj/processor/content/__init__.py: File without changes
khoj/processor/content/docx/__init__.py: File without changes
khoj/processor/content/docx/docx_to_entries.py
@@ -0,0 +1,110 @@
+import logging
+import os
+from datetime import datetime
+from typing import Dict, List, Tuple
+
+from langchain_community.document_loaders import Docx2txtLoader
+
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import KhojUser
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import Entry
+
+logger = logging.getLogger(__name__)
+
+
+class DocxToEntries(TextToEntries):
+    def __init__(self):
+        super().__init__()
+
+    # Define Functions
+    def process(
+        self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+    ) -> Tuple[int, int]:
+        # Extract required fields from config
+        if not full_corpus:
+            deletion_file_names = set([file for file in files if files[file] == b""])
+            files_to_process = set(files) - deletion_file_names
+            files = {file: files[file] for file in files_to_process}
+        else:
+            deletion_file_names = None
+
+        # Extract Entries from specified Docx files
+        with timer("Extract entries from specified DOCX files", logger):
+            file_to_text_map, current_entries = DocxToEntries.extract_docx_entries(files)
+
+        # Split entries by max tokens supported by model
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                current_entries,
+                DbEntry.EntryType.DOCX,
+                DbEntry.EntrySource.COMPUTER,
+                "compiled",
+                logger,
+                deletion_file_names,
+                user,
+                regenerate=regenerate,
+                file_to_text_map=file_to_text_map,
+            )
+
+        return num_new_embeddings, num_deleted_embeddings
+
+    @staticmethod
+    def extract_docx_entries(docx_files) -> Tuple[Dict, List[Entry]]:
+        """Extract entries from specified DOCX files"""
+
+        entries: List[str] = []
+        entry_to_location_map: List[Tuple[str, str]] = []
+        file_to_text_map = dict()
+        for docx_file in docx_files:
+            try:
+                timestamp_now = datetime.utcnow().timestamp()
+                tmp_file = f"tmp_docx_file_{timestamp_now}.docx"
+                with open(tmp_file, "wb") as f:
+                    bytes_content = docx_files[docx_file]
+                    f.write(bytes_content)
+
+                # Load the content using Docx2txtLoader
+                loader = Docx2txtLoader(tmp_file)
+                docx_entries_per_file = loader.load()
+
+                # Convert the loaded entries into the desired format
+                docx_texts = [page.page_content for page in docx_entries_per_file]
+
+                entry_to_location_map += zip(docx_texts, [docx_file] * len(docx_texts))
+                entries.extend(docx_texts)
+                file_to_text_map[docx_file] = docx_texts
+            except Exception as e:
+                logger.warning(f"Unable to process file: {docx_file}. This file will not be indexed.")
+                logger.warning(e, exc_info=True)
+            finally:
+                if os.path.exists(f"{tmp_file}"):
+                    os.remove(f"{tmp_file}")
+        return file_to_text_map, DocxToEntries.convert_docx_entries_to_maps(entries, dict(entry_to_location_map))
+
+    @staticmethod
+    def convert_docx_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
+        """Convert each DOCX entry into a dictionary"""
+        entries = []
+        for parsed_entry in parsed_entries:
+            entry_filename = entry_to_file_map[parsed_entry]
+            # Append base filename to compiled entry for context to model
+            heading = f"{entry_filename}\n"
+            compiled_entry = f"{heading}{parsed_entry}"
+            entries.append(
+                Entry(
+                    compiled=compiled_entry,
+                    raw=parsed_entry,
+                    heading=heading,
+                    file=f"{entry_filename}",
+                )
+            )
+
+        logger.debug(f"Converted {len(parsed_entries)} DOCX entries to dictionaries")
+
+        return entries
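Docx2txtLoader only accepts a file path, which is why extract_docx_entries round-trips each in-memory byte blob through a temporary .docx file before loading it. A minimal standalone sketch of that extraction step, assuming a sample.docx exists on disk:

    from langchain_community.document_loaders import Docx2txtLoader

    # Load the document and pull out its plain text, as extract_docx_entries does.
    loader = Docx2txtLoader("sample.docx")  # sample.docx is an assumed local file
    documents = loader.load()
    docx_texts = [doc.page_content for doc in documents]
    print(f"Extracted {len(docx_texts)} text block(s), {sum(len(t) for t in docx_texts)} characters")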
khoj/processor/content/github/__init__.py: File without changes
khoj/processor/content/github/github_to_entries.py
@@ -0,0 +1,224 @@
+import logging
+import time
+from typing import Any, Dict, List, Tuple
+
+import requests
+from magika import Magika
+
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import GithubConfig, KhojUser
+from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
+from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import GithubContentConfig, GithubRepoConfig
+
+logger = logging.getLogger(__name__)
+magika = Magika()
+
+
+class GithubToEntries(TextToEntries):
+    def __init__(self, config: GithubConfig):
+        super().__init__(config)
+        raw_repos = config.githubrepoconfig.all()
+        repos = []
+        for repo in raw_repos:
+            repos.append(
+                GithubRepoConfig(
+                    name=repo.name,
+                    owner=repo.owner,
+                    branch=repo.branch,
+                )
+            )
+        self.config = GithubContentConfig(
+            pat_token=config.pat_token,
+            repos=repos,
+        )
+        self.session = requests.Session()
+        self.session.headers.update({"Authorization": f"token {self.config.pat_token}"})
+
+    @staticmethod
+    def wait_for_rate_limit_reset(response, func, *args, **kwargs):
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
+            logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
+            time.sleep(wait_time)
+            return func(*args, **kwargs)
+        else:
+            return
+
+    def process(
+        self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+    ) -> Tuple[int, int]:
+        if self.config.pat_token is None or self.config.pat_token == "":
+            logger.error(f"Github PAT token is not set. Skipping github content")
+            raise ValueError("Github PAT token is not set. Skipping github content")
+        current_entries = []
+        for repo in self.config.repos:
+            current_entries += self.process_repo(repo)
+
+        return self.update_entries_with_ids(current_entries, user=user)
+
+    def process_repo(self, repo: GithubRepoConfig):
+        repo_url = f"https://api.github.com/repos/{repo.owner}/{repo.name}"
+        repo_shorthand = f"{repo.owner}/{repo.name}"
+        logger.info(f"Processing github repo {repo_shorthand}")
+        with timer("Download files from github repo", logger):
+            try:
+                markdown_files, org_files, plaintext_files = self.get_files(repo_url, repo)
+            except ConnectionAbortedError as e:
+                logger.error(f"Github rate limit reached. Skip indexing github repo {repo_shorthand}")
+                raise e
+            except Exception as e:
+                logger.error(f"Unable to download github repo {repo_shorthand}", exc_info=True)
+                raise e
+
+        logger.info(
+            f"Found {len(markdown_files)} md, {len(org_files)} org and {len(plaintext_files)} text files in github repo {repo_shorthand}"
+        )
+        current_entries = []
+
+        with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
+            current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
+                *GithubToEntries.extract_markdown_entries(markdown_files)
+            )
+
+        with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
+            current_entries += OrgToEntries.convert_org_nodes_to_entries(
+                *GithubToEntries.extract_org_entries(org_files)
+            )
+
+        with timer(f"Extract plaintext entries from github repo {repo_shorthand}", logger):
+            current_entries += PlaintextToEntries.convert_text_files_to_entries(
+                *GithubToEntries.extract_plaintext_entries(plaintext_files)
+            )
+
+        with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
+            current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+        return current_entries
+
+    def update_entries_with_ids(self, current_entries, user: KhojUser = None):
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                current_entries,
+                DbEntry.EntryType.GITHUB,
+                DbEntry.EntrySource.GITHUB,
+                key="compiled",
+                logger=logger,
+                user=user,
+            )
+
+        return num_new_embeddings, num_deleted_embeddings
+
+    def get_files(self, repo_url: str, repo: GithubRepoConfig):
+        # Get the contents of the repository
+        repo_content_url = f"{repo_url}/git/trees/{repo.branch}"
+        headers = {"Authorization": f"token {self.config.pat_token}"}
+        params = {"recursive": "true"}
+        response = requests.get(repo_content_url, headers=headers, params=params)
+        contents = response.json()
+
+        # Raise exception if hit rate limit
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            raise ConnectionAbortedError("Github rate limit reached")
+
+        # Extract markdown files from the repository
+        markdown_files: List[Dict[str, str]] = []
+        org_files: List[Dict[str, str]] = []
+        plaintext_files: List[Dict[str, str]] = []
+        if "tree" not in contents:
+            return markdown_files, org_files, plaintext_files
+
+        for item in contents["tree"]:
+            # Find all markdown files in the repository
+            if item["type"] == "blob" and item["path"].endswith(".md"):
+                # Create URL for each markdown file on Github
+                url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'
+
+                # Add markdown file contents and URL to list
+                markdown_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]
+
+            # Find all org files in the repository
+            elif item["type"] == "blob" and item["path"].endswith(".org"):
+                # Create URL for each org file on Github
+                url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'
+
+                # Add org file contents and URL to list
+                org_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]
+
+            # Find, index remaining non-binary files in the repository
+            elif item["type"] == "blob":
+                url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'
+                content_bytes = self.get_file_contents(item["url"], decode=False)
+                content_type, content_str = None, None
+                try:
+                    content_type = magika.identify_bytes(content_bytes).output.group
+                except:
+                    logger.error(f"Unable to identify content type of file at {url_path}. Skip indexing it")
+                    continue
+
+                # Add non-binary file contents and URL to list
+                if content_type in ["text", "code"]:
+                    try:
+                        content_str = content_bytes.decode("utf-8")
+                    except:
+                        logger.error(f"Unable to decode content of file at {url_path}. Skip indexing it")
+                        continue
+                    plaintext_files += [{"content": content_str, "path": url_path}]
+
+        return markdown_files, org_files, plaintext_files
+
+    def get_file_contents(self, file_url, decode=True):
+        # Get text from each markdown file
+        headers = {"Accept": "application/vnd.github.v3.raw"}
+        response = self.session.get(file_url, headers=headers, stream=True)
+
+        # Stop indexing on hitting rate limit
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            raise ConnectionAbortedError("Github rate limit reached")
+
+        content = "" if decode else b""
+        for chunk in response.iter_content(chunk_size=2048):
+            if chunk:
+                try:
+                    content += chunk.decode("utf-8") if decode else chunk
+                except Exception as e:
+                    logger.error(f"Unable to decode chunk from {file_url}")
+                    logger.error(e)
+
+        return content
+
+    @staticmethod
+    def extract_markdown_entries(markdown_files):
+        entries = []
+        entry_to_file_map = []
+        for doc in markdown_files:
+            entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
+                doc["content"], doc["path"], entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)
+
+    @staticmethod
+    def extract_org_entries(org_files):
+        entries = []
+        entry_to_file_map = []
+
+        for doc in org_files:
+            entries, entry_to_file_map = OrgToEntries.process_single_org_file(
+                doc["content"], doc["path"], entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)
+
+    @staticmethod
+    def extract_plaintext_entries(plaintext_files):
+        entries = []
+        entry_to_file_map = []
+
+        for doc in plaintext_files:
+            entries, entry_to_file_map = PlaintextToEntries.process_single_plaintext_file(
+                doc["content"], doc["path"], entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)
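get_files lists the whole repository in a single call to the GitHub Git Trees API with recursive=true, then streams each matching blob through get_file_contents using the raw media type. A minimal sketch of those two requests, assuming a GITHUB_PAT environment variable is set; the owner, name, and branch values are illustrative:

    import os

    import requests

    owner, name, branch = "khoj-ai", "khoj", "master"  # illustrative repo coordinates
    session = requests.Session()
    session.headers.update({"Authorization": f"token {os.environ['GITHUB_PAT']}"})

    # 1. One call lists every blob on the branch via the Git Trees API.
    tree = session.get(
        f"https://api.github.com/repos/{owner}/{name}/git/trees/{branch}",
        params={"recursive": "true"},
    ).json()
    md_blobs = [i for i in tree.get("tree", []) if i["type"] == "blob" and i["path"].endswith(".md")]

    # 2. Fetch one blob's raw contents with the same Accept header as get_file_contents.
    if md_blobs:
        raw = session.get(md_blobs[0]["url"], headers={"Accept": "application/vnd.github.v3.raw"})
        print(md_blobs[0]["path"], len(raw.content), "bytes")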
khoj/processor/content/images/__init__.py: File without changes
khoj/processor/content/images/image_to_entries.py
@@ -0,0 +1,118 @@
+import base64
+import logging
+import os
+from datetime import datetime
+from typing import Dict, List, Tuple
+
+from rapidocr_onnxruntime import RapidOCR
+
+from khoj.database.models import Entry as DbEntry
+from khoj.database.models import KhojUser
+from khoj.processor.content.text_to_entries import TextToEntries
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import Entry
+
+logger = logging.getLogger(__name__)
+
+
+class ImageToEntries(TextToEntries):
+    def __init__(self):
+        super().__init__()
+
+    # Define Functions
+    def process(
+        self, files: dict[str, str] = None, full_corpus: bool = True, user: KhojUser = None, regenerate: bool = False
+    ) -> Tuple[int, int]:
+        # Extract required fields from config
+        if not full_corpus:
+            deletion_file_names = set([file for file in files if files[file] == b""])
+            files_to_process = set(files) - deletion_file_names
+            files = {file: files[file] for file in files_to_process}
+        else:
+            deletion_file_names = None
+
+        # Extract Entries from specified image files
+        with timer("Extract entries from specified Image files", logger):
+            file_to_text_map, current_entries = ImageToEntries.extract_image_entries(files)
+
+        # Split entries by max tokens supported by model
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
+                current_entries,
+                DbEntry.EntryType.IMAGE,
+                DbEntry.EntrySource.COMPUTER,
+                "compiled",
+                logger,
+                deletion_file_names,
+                user,
+                regenerate=regenerate,
+                file_to_text_map=file_to_text_map,
+            )
+
+        return num_new_embeddings, num_deleted_embeddings
+
+    @staticmethod
+    def extract_image_entries(image_files) -> Tuple[Dict, List[Entry]]:  # important function
+        """Extract entries by page from specified image files"""
+        file_to_text_map = dict()
+        entries: List[str] = []
+        entry_to_location_map: List[Tuple[str, str]] = []
+        for image_file in image_files:
+            try:
+                loader = RapidOCR()
+                bytes = image_files[image_file]
+                # write the image to a temporary file
+                timestamp_now = datetime.utcnow().timestamp()
+                # use either png or jpg
+                if image_file.endswith(".png"):
+                    tmp_file = f"tmp_image_file_{timestamp_now}.png"
+                elif image_file.endswith(".jpg") or image_file.endswith(".jpeg"):
+                    tmp_file = f"tmp_image_file_{timestamp_now}.jpg"
+                with open(tmp_file, "wb") as f:
+                    bytes = image_files[image_file]
+                    f.write(bytes)
+                try:
+                    image_entries_per_file = ""
+                    result, _ = loader(tmp_file)
+                    if result:
+                        expanded_entries = [text[1] for text in result]
+                        image_entries_per_file = " ".join(expanded_entries)
+                except ImportError:
+                    logger.warning(f"Unable to process file: {image_file}. This file will not be indexed.")
+                    continue
+                entry_to_location_map.append((image_entries_per_file, image_file))
+                entries.extend([image_entries_per_file])
+                file_to_text_map[image_file] = image_entries_per_file
+            except Exception as e:
+                logger.warning(f"Unable to process file: {image_file}. This file will not be indexed.")
+                logger.warning(e, exc_info=True)
+            finally:
+                if os.path.exists(tmp_file):
+                    os.remove(tmp_file)
+        return file_to_text_map, ImageToEntries.convert_image_entries_to_maps(entries, dict(entry_to_location_map))
+
+    @staticmethod
+    def convert_image_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
+        "Convert each image entry into a dictionary"
+        entries = []
+        for parsed_entry in parsed_entries:
+            entry_filename = entry_to_file_map[parsed_entry]
+            # Append base filename to compiled entry for context to model
+            heading = f"{entry_filename}\n"
+            compiled_entry = f"{heading}{parsed_entry}"
+            entries.append(
+                Entry(
+                    compiled=compiled_entry,
+                    raw=parsed_entry,
+                    heading=heading,
+                    file=f"{entry_filename}",
+                )
+            )
+
+        logger.debug(f"Converted {len(parsed_entries)} image entries to dictionaries")
+
+        return entries
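RapidOCR is invoked as a callable on an image path and returns a (result, elapse) pair, where each result row holds a bounding box, the recognized text, and a confidence score; the indexer keeps only the text at index 1. A minimal sketch of that OCR step, assuming a scan.png exists on disk:

    from rapidocr_onnxruntime import RapidOCR

    ocr = RapidOCR()
    result, _ = ocr("scan.png")  # scan.png is an assumed local image

    # Keep only the recognized text from each (box, text, confidence) row,
    # joined into one string, as extract_image_entries does.
    if result:
        print(" ".join(detection[1] for detection in result))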
khoj/processor/content/markdown/__init__.py: File without changes