pembot 0.0.3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pembot might be problematic.
- pembot/.git/COMMIT_EDITMSG +1 -0
- pembot/.git/HEAD +1 -0
- pembot/.git/config +11 -0
- pembot/.git/description +1 -0
- pembot/.git/hooks/applypatch-msg.sample +15 -0
- pembot/.git/hooks/commit-msg.sample +24 -0
- pembot/.git/hooks/fsmonitor-watchman.sample +174 -0
- pembot/.git/hooks/post-update.sample +8 -0
- pembot/.git/hooks/pre-applypatch.sample +14 -0
- pembot/.git/hooks/pre-commit.sample +49 -0
- pembot/.git/hooks/pre-merge-commit.sample +13 -0
- pembot/.git/hooks/pre-push.sample +53 -0
- pembot/.git/hooks/pre-rebase.sample +169 -0
- pembot/.git/hooks/pre-receive.sample +24 -0
- pembot/.git/hooks/prepare-commit-msg.sample +42 -0
- pembot/.git/hooks/push-to-checkout.sample +78 -0
- pembot/.git/hooks/sendemail-validate.sample +77 -0
- pembot/.git/hooks/update.sample +128 -0
- pembot/.git/index +0 -0
- pembot/.git/info/exclude +6 -0
- pembot/.git/logs/HEAD +6 -0
- pembot/.git/logs/refs/heads/main +6 -0
- pembot/.git/logs/refs/remotes/origin/HEAD +1 -0
- pembot/.git/logs/refs/remotes/origin/main +5 -0
- pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c +0 -0
- pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1 +0 -0
- pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63 +0 -0
- pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7 +0 -0
- pembot/.git/objects/28/db0ab48059acccd7d257aa02e52e9b6b83a4a5 +0 -0
- pembot/.git/objects/35/97e518a8658280be9f377f78edf1dfa1f23814 +0 -0
- pembot/.git/objects/3d/07d3b29ff53d95de3898fb786d61732f210515 +0 -0
- pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb +0 -0
- pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba +0 -0
- pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0 +0 -0
- pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904 +0 -0
- pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba +0 -0
- pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177 +0 -0
- pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9 +0 -0
- pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331 +0 -0
- pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5 +0 -0
- pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25 +0 -0
- pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7 +0 -0
- pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8 +0 -0
- pembot/.git/objects/93/8f29d9b4b1ae86e39dddf9e3d115a82ddfc9b6 +0 -0
- pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456 +0 -0
- pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7 +0 -0
- pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880 +0 -0
- pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d +0 -0
- pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb +0 -0
- pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3 +0 -0
- pembot/.git/objects/bf/32a7e6872e5dc4025ee3df3c921ec7ade0855f +0 -0
- pembot/.git/objects/c0/793458db6e1bee7f79f1a504fb8ff4963f8ed3 +0 -0
- pembot/.git/objects/c2/443060c07101948487cfa93cc39e082e9e0f5f +0 -0
- pembot/.git/objects/e5/3070f2b07f45d031444b09b1b38658f3caf29e +0 -0
- pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc +0 -0
- pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d +0 -0
- pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58 +0 -0
- pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238 +0 -0
- pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552 +0 -0
- pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8 +0 -0
- pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300 +0 -0
- pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88 +0 -0
- pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6 +0 -0
- pembot/.git/objects/fb/6c90c9ce5e0cdfbe074a3f060afc66f62eefde +0 -0
- pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2 +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev +0 -0
- pembot/.git/packed-refs +2 -0
- pembot/.git/refs/heads/main +1 -0
- pembot/.git/refs/remotes/origin/HEAD +1 -0
- pembot/.git/refs/remotes/origin/main +1 -0
- pembot/.gitignore +7 -0
- pembot/AnyToText/__init__.py +0 -0
- pembot/AnyToText/convertor.py +260 -0
- pembot/LICENSE +674 -0
- pembot/TextEmbedder/__init__.py +0 -0
- pembot/TextEmbedder/gemini_embedder.py +27 -0
- pembot/TextEmbedder/mongodb_embedder.py +258 -0
- pembot/TextEmbedder/mongodb_index_creator.py +133 -0
- pembot/TextEmbedder/vector_query.py +64 -0
- pembot/__init__.py +6 -0
- pembot/config/config.yaml +5 -0
- pembot/gartner.py +140 -0
- pembot/main.py +208 -0
- pembot/output_structure_local.py +63 -0
- pembot/pdf2markdown/.git/HEAD +1 -0
- pembot/pdf2markdown/.git/config +11 -0
- pembot/pdf2markdown/.git/description +1 -0
- pembot/pdf2markdown/.git/hooks/applypatch-msg.sample +15 -0
- pembot/pdf2markdown/.git/hooks/commit-msg.sample +24 -0
- pembot/pdf2markdown/.git/hooks/fsmonitor-watchman.sample +174 -0
- pembot/pdf2markdown/.git/hooks/post-update.sample +8 -0
- pembot/pdf2markdown/.git/hooks/pre-applypatch.sample +14 -0
- pembot/pdf2markdown/.git/hooks/pre-commit.sample +49 -0
- pembot/pdf2markdown/.git/hooks/pre-merge-commit.sample +13 -0
- pembot/pdf2markdown/.git/hooks/pre-push.sample +53 -0
- pembot/pdf2markdown/.git/hooks/pre-rebase.sample +169 -0
- pembot/pdf2markdown/.git/hooks/pre-receive.sample +24 -0
- pembot/pdf2markdown/.git/hooks/prepare-commit-msg.sample +42 -0
- pembot/pdf2markdown/.git/hooks/push-to-checkout.sample +78 -0
- pembot/pdf2markdown/.git/hooks/sendemail-validate.sample +77 -0
- pembot/pdf2markdown/.git/hooks/update.sample +128 -0
- pembot/pdf2markdown/.git/index +0 -0
- pembot/pdf2markdown/.git/info/exclude +6 -0
- pembot/pdf2markdown/.git/logs/HEAD +1 -0
- pembot/pdf2markdown/.git/logs/refs/heads/main +1 -0
- pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD +1 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx +0 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack +0 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev +0 -0
- pembot/pdf2markdown/.git/packed-refs +2 -0
- pembot/pdf2markdown/.git/refs/heads/main +1 -0
- pembot/pdf2markdown/.git/refs/remotes/origin/HEAD +1 -0
- pembot/pdf2markdown/LICENSE +21 -0
- pembot/pdf2markdown/README.md +107 -0
- pembot/pdf2markdown/__init__.py +0 -0
- pembot/pdf2markdown/config/config.yaml +2 -0
- pembot/pdf2markdown/extract.py +888 -0
- pembot/pdf2markdown/requirements.txt +8 -0
- pembot/pem.py +157 -0
- pembot/query.py +204 -0
- pembot/utils/__init__.py +0 -0
- pembot/utils/inference_client.py +132 -0
- pembot/utils/string_tools.py +45 -0
- pembot-0.0.3.dist-info/METADATA +8 -0
- pembot-0.0.3.dist-info/RECORD +129 -0
- pembot-0.0.3.dist-info/WHEEL +5 -0
- pembot-0.0.3.dist-info/licenses/LICENSE +674 -0
pembot/main.py
ADDED
@@ -0,0 +1,208 @@
from pathlib import Path

from huggingface_hub import InferenceClient
from huggingface_hub.inference._providers import PROVIDER_T
import ollama
from pymongo import MongoClient
from pembot.AnyToText.convertor import Convertor
from pembot.TextEmbedder.mongodb_embedder import process_document_and_embed
from pembot.query import rag_query_llm, remove_bs
import os
import json
from pembot.utils.string_tools import make_it_an_id
from schema.structure import required_fields


def make_query(required_fields: list[tuple[str, str, str, str]]):
    """
    Build a query prompt that asks for JSON in the shape of the required fields.
    """
    # Construct the part of the prompt that defines the desired JSON structure
    json_structure_definition = "{\n"
    for i, field in enumerate(required_fields):
        # Tuple order is (field name, field description, field type, default value),
        # matching the REQUIRED_FIELDS note at the bottom of this file.
        field_name, field_description, field_type, default_value = field
        json_structure_definition += f'  "{field_name}": "({field_type}) <{field_description}, default: {default_value}>"'
        if i < len(required_fields) - 1:
            json_structure_definition += ",\n"
        else:
            json_structure_definition += "\n"
    json_structure_definition += "}"

    # Construct the full query
    query = (
        "Extract the following information from the above provided context and return it as a JSON object. "
        "Ensure the output strictly conforms to the JSON format. "
        "Use the default values if the information is not found in the text.\n"
        "The required JSON structure is:\n"
        f"{json_structure_definition}\n\n"
        "JSON Output:"
    )
    return query


def save_to_json_file(llm_output: str, filepath: Path):
    """
    Validate a JSON string and write it to a .json file.
    """
    try:
        # Ensure the directory exists
        filepath.parent.mkdir(parents=True, exist_ok=True)

        # Attempt to parse the string to validate it's JSON
        # and to get a nicely formatted string for the file.
        json_data = json.loads(llm_output)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=4, ensure_ascii=False)
        print(f"Successfully saved JSON to {filepath}")
        return json_data
    except json.JSONDecodeError:
        print(f"Error: LLM output is not valid JSON. Could not save to {filepath}")
        print("LLM Output was:\n", llm_output)
        # Optionally, save the raw invalid output for debugging
        # raw_output_path = filepath.with_suffix('.raw_llm_output.txt')
        # with open(raw_output_path, 'w', encoding='utf-8') as f:
        #     f.write(llm_output)
        # print(f"Raw LLM output saved to {raw_output_path}")
    except IOError as e:
        print(f"Error saving file to {filepath}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred in save_to_json_file: {e}")


def make_document_summarization_and_embeddings(
    db_client, llm_client, inference_client, docs_dir: Path, text_out_dir: Path,
    required_fields: list[tuple[str, str, str, str]], chunk_size: int = 600,
    embedding_model: str = 'nomic-embed-text:v1.5', llm_provider_name: PROVIDER_T = "novita",
    model_name="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
    embeddings_collection: str = "doc_chunks", index_name="test_search",
):
    # given the required output fields:
    # take the documents, convert them to text,
    # chunk and embed them into MongoDB,
    # then query the required fields
    for docfile in docs_dir.iterdir():

        file_root = os.path.splitext(docfile.name)[0]
        expected_json = text_out_dir / 'json' / (file_root + '.json')
        document_id = make_it_an_id(file_root)

        if docfile.is_file() and not expected_json.exists():

            expected_markdown = text_out_dir / (file_root + '.md')
            if not expected_markdown.exists():
                converted = Convertor(docfile, text_out_dir)
                print("markdown made.", text_out_dir)

            # the case that the convertor already made the json
            if expected_json.exists():
                continue

            # text files will be chunked and stored in separate persistent vector collections
            process_document_and_embed(db_client, llm_client, inference_client, expected_markdown,
                                       chunk_size=chunk_size, embedding_model=embedding_model,
                                       embeddings_collection_name=embeddings_collection)
            print("it's in the DB now")

            query = make_query(required_fields)
            print("full query is:")
            print(query)
            filename_string = file_root + '.json'
            # field[1] is the field description
            required_fields_descriptions = [field[1] for field in required_fields]
            llm_output = rag_query_llm(db_client, llm_client, inference_client, query, document_id,
                                       required_fields_descriptions, no_of_fields=len(required_fields),
                                       llm_provider_name=llm_provider_name, model_name=model_name,
                                       embedding_model=embedding_model,
                                       embeddings_collection=embeddings_collection, index_name=index_name)

            # llm_output = rag_query_llm(query, no_of_fields=len(required_fields))
            jsonstr = remove_bs(llm_output)

            save_to_json_file(jsonstr, text_out_dir / 'json' / filename_string)


def upload_summaries(json_dir: Path, docs_collection):

    for json_path in json_dir.iterdir():

        base_name, _ = os.path.splitext(json_path.name)
        corresponding_text_file = json_dir.parent / (base_name + ".md")
        document_name_id = make_it_an_id(base_name)

        with open(str(json_path)) as json_file:
            json_data = json.load(json_file)

        print("pushing doc: ", corresponding_text_file, json_path)
        result = docs_collection.update_one(
            {"document_name_id": document_name_id},  # filter by the document id field
            {"$setOnInsert": {**json_data, "document_name_id": document_name_id}},  # set these fields ONLY on insert
            upsert=True  # insert if no matching document is found
        )

        if result.upserted_id:
            print(f"New document inserted with _id: {result.upserted_id}")
        else:
            print("Document with this document_name_id already exists; left unchanged.")


def initit(db_client, llm_client, inference_client, chunk_size=500,
           embedding_model="BAAI/bge-en-icl", llm_provider_name: PROVIDER_T = "novita",
           model_name="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
           embeddings_collection: str = "doc_chunks", index_name="test_search"):

    local_project_files_dir = Path.cwd().parent
    docs = local_project_files_dir / 'documents'
    text_out = local_project_files_dir / 'text-outputs'

    docs.mkdir(parents=True, exist_ok=True)
    text_out.mkdir(parents=True, exist_ok=True)

    make_document_summarization_and_embeddings(db_client, llm_client, inference_client, docs, text_out,
                                               required_fields, chunk_size=chunk_size,
                                               embedding_model=embedding_model,
                                               llm_provider_name=llm_provider_name, model_name=model_name,
                                               embeddings_collection=embeddings_collection,
                                               index_name=index_name)

    return text_out


if __name__ == "__main__":

    mongodb_uri = os.environ['MONGODB_PEMBOT']
    mc = MongoClient(mongodb_uri)

    llm_client = ollama.Client()

    # To use Jina instead of the Hugging Face SDK, replace the top-level
    # InferenceClient import with the local client:
    # from pembot.utils.inference_client import InferenceClient
    # JINA_API_KEY = os.environ['JINA_API_KEY']
    # inference_client = InferenceClient(
    #     provider="Jina AI",
    #     api_key=JINA_API_KEY,
    # )

    inference_client = InferenceClient(
        provider="hf-inference",
        api_key=os.environ["HF_TOKEN"],
    )

    mc.admin.command('ping')
    print("ping test ok")
    database = mc["pembot"]
    print("dbs and cols loaded")

    embeddings_collection: str = "doc_chunks"

    # use this if you want LLM inference from a different provider than embeddings
    llm_provider_name: PROVIDER_T = "nebius"

    # weaker, but provided by HF serverless inference: BAAI/bge-small-en-v1.5
    # Worth mentioning:
    # jinaai/jina-embeddings-v3
    # BAAI/bge-base-en-v1.5
    # nomic-ai/nomic-embed-text-v1.5
    # embedding_model: str = 'BAAI/bge-base-en-v1.5'
    embedding_model: str = 'gemini-embedding-exp-03-07'
    # model_name: str = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
    # model_name: str = "google/gemma-3-27b-it"
    model_name: str = "gemini-2.5-flash-preview-05-20"

    index_name: str = "gemini_vectors"

    # output tokens are ~1000 at most
    # chunk_size = 1000  # for small models, e.g. gemma 1b or similar
    chunk_size = int(250_000 / len(required_fields))  # ~63k tokens => ~250,000 characters

    # REQUIRED_FIELDS is an array of tuples:
    # (field name, field description, field type, default value)

    process_output_dir = initit(database, llm_client, inference_client, chunk_size=chunk_size,
                                embedding_model=embedding_model, llm_provider_name=llm_provider_name,
                                model_name=model_name, embeddings_collection=embeddings_collection,
                                index_name=index_name)

    docs_collection = database["summary_docs"]
    upload_summaries(process_output_dir / 'json', docs_collection)
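For illustration only, a quick sketch of the prompt make_query builds, assuming the documented (name, description, type, default) tuple order; the fields below are hypothetical and not taken from schema.structure:

    fields = [
        ("vendor_name", "name of the vendor", "str", "unknown"),
        ("founded_year", "year the company was founded", "int", "0"),
    ]
    print(make_query(fields))
    # ...the printed prompt ends with:
    # The required JSON structure is:
    # {
    #   "vendor_name": "(str) <name of the vendor, default: unknown>",
    #   "founded_year": "(int) <year the company was founded, default: 0>"
    # }
    #
    # JSON Output: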
pembot/output_structure_local.py
ADDED
@@ -0,0 +1,63 @@
import json

from llama_cpp import Llama, LogitsProcessorList
from huggingface_hub import hf_hub_download
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.llamacpp import build_llamacpp_logits_processor
from pydantic import BaseModel


def generate_structured_data(input_data: str, pydantic_class: type[BaseModel],
                             model_path: str = "TheBloke/Llama-2-7b-Chat-GGUF") -> dict:
    """
    Generate structured data according to a Pydantic class.

    Args:
    - input_data (str): The input data to be processed.
    - pydantic_class (type[BaseModel]): The Pydantic class to structure the data.
    - model_path (str): The Hugging Face repo id of the Llama model.

    Returns:
    - dict: The structured data according to the Pydantic class.
    """
    # Download the Llama model if it isn't cached yet
    downloaded_model_path = hf_hub_download(repo_id=model_path, filename="llama-2-7b-chat.Q5_K_M.gguf")

    # Initialize the Llama model
    llm = Llama(model_path=downloaded_model_path)

    # Define the system prompt
    DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
"""

    # Build the Llama-2 chat prompt
    def get_prompt(message: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
        return f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{message} [/INST]'

    # Run the model with a logits processor that constrains output to the JSON schema
    def llamacpp_with_json_schema_parser(llm: Llama, prompt: str, json_schema_parser: JsonSchemaParser) -> str:
        logits_processors: LogitsProcessorList = LogitsProcessorList(
            [build_llamacpp_logits_processor(llm, json_schema_parser)])
        output = llm(prompt, logits_processor=logits_processors)
        text: str = output['choices'][0]['text']
        return text

    # Define the question and prompt
    question = (f'Please generate structured data according to the following JSON schema: '
                f'{pydantic_class.schema_json()}. The input data is: {input_data}')
    prompt = get_prompt(question)

    # Generate the structured data
    json_schema_parser = JsonSchemaParser(pydantic_class.schema())
    result = llamacpp_with_json_schema_parser(llm, prompt, json_schema_parser)

    # Parse the schema-constrained output; json.loads is the safe choice here,
    # unlike eval, which should never be run on model output.
    return json.loads(result)


# Example usage:
class PlanetSchema(BaseModel):
    planet_name: str


class PlanetList(BaseModel):
    planets: list[PlanetSchema]


if __name__ == "__main__":
    input_data = "Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune"
    result = generate_structured_data(input_data, PlanetList)
    print(result)
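Since the logits processor already forces schema-valid JSON, a natural follow-up (a sketch, not part of this package) is to run the parsed dict back through the same Pydantic model, so any mismatch fails loudly instead of propagating:

    data = generate_structured_data(input_data, PlanetList)
    planets = PlanetList(**data)  # raises pydantic.ValidationError on mismatch
    print(planets.planets[0].planet_name)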
@@ -0,0 +1 @@
ref: refs/heads/main
@@ -0,0 +1,11 @@
[core]
	repositoryformatversion = 0
	filemode = true
	bare = false
	logallrefupdates = true
[remote "origin"]
	url = https://github.com/iamarunbrahma/pdf-to-markdown
	fetch = +refs/heads/*:refs/remotes/origin/*
[branch "main"]
	remote = origin
	merge = refs/heads/main
@@ -0,0 +1 @@
Unnamed repository; edit this file 'description' to name the repository.
@@ -0,0 +1,15 @@
#!/bin/sh
#
# An example hook script to check the commit log message taken by
# applypatch from an e-mail message.
#
# The hook should exit with non-zero status after issuing an
# appropriate message if it wants to stop the commit.  The hook is
# allowed to edit the commit message file.
#
# To enable this hook, rename this file to "applypatch-msg".

. git-sh-setup
commitmsg="$(git rev-parse --git-path hooks/commit-msg)"
test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"}
:
@@ -0,0 +1,24 @@
#!/bin/sh
#
# An example hook script to check the commit log message.
# Called by "git commit" with one argument, the name of the file
# that has the commit message.  The hook should exit with non-zero
# status after issuing an appropriate message if it wants to stop the
# commit.  The hook is allowed to edit the commit message file.
#
# To enable this hook, rename this file to "commit-msg".

# Uncomment the below to add a Signed-off-by line to the message.
# Doing this in a hook is a bad idea in general, but the prepare-commit-msg
# hook is more suited to it.
#
# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1"

# This example catches duplicate Signed-off-by lines.

test "" = "$(grep '^Signed-off-by: ' "$1" |
	sort | uniq -c | sed -e '/^[ 	]*1[ 	]/d')" || {
	echo >&2 Duplicate Signed-off-by lines.
	exit 1
}
@@ -0,0 +1,174 @@
#!/usr/bin/perl

use strict;
use warnings;
use IPC::Open2;

# An example hook script to integrate Watchman
# (https://facebook.github.io/watchman/) with git to speed up detecting
# new and modified files.
#
# The hook is passed a version (currently 2) and last update token
# formatted as a string and outputs to stdout a new update token and
# all files that have been modified since the update token. Paths must
# be relative to the root of the working tree and separated by a single NUL.
#
# To enable this hook, rename this file to "query-watchman" and set
# 'git config core.fsmonitor .git/hooks/query-watchman'
#
my ($version, $last_update_token) = @ARGV;

# Uncomment for debugging
# print STDERR "$0 $version $last_update_token\n";

# Check the hook interface version
if ($version ne 2) {
	die "Unsupported query-fsmonitor hook version '$version'.\n" .
	    "Falling back to scanning...\n";
}

my $git_work_tree = get_working_dir();

my $retry = 1;

my $json_pkg;
eval {
	require JSON::XS;
	$json_pkg = "JSON::XS";
	1;
} or do {
	require JSON::PP;
	$json_pkg = "JSON::PP";
};

launch_watchman();

sub launch_watchman {
	my $o = watchman_query();
	if (is_work_tree_watched($o)) {
		output_result($o->{clock}, @{$o->{files}});
	}
}

sub output_result {
	my ($clockid, @files) = @_;

	# Uncomment for debugging watchman output
	# open (my $fh, ">", ".git/watchman-output.out");
	# binmode $fh, ":utf8";
	# print $fh "$clockid\n@files\n";
	# close $fh;

	binmode STDOUT, ":utf8";
	print $clockid;
	print "\0";
	local $, = "\0";
	print @files;
}

sub watchman_clock {
	my $response = qx/watchman clock "$git_work_tree"/;
	die "Failed to get clock id on '$git_work_tree'.\n" .
		"Falling back to scanning...\n" if $? != 0;

	return $json_pkg->new->utf8->decode($response);
}

sub watchman_query {
	my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty')
	or die "open2() failed: $!\n" .
	"Falling back to scanning...\n";

	# In the query expression below we're asking for names of files that
	# changed since $last_update_token but not from the .git folder.
	#
	# To accomplish this, we're using the "since" generator to use the
	# recency index to select candidate nodes and "fields" to limit the
	# output to file names only. Then we're using the "expression" term to
	# further constrain the results.
	my $last_update_line = "";
	if (substr($last_update_token, 0, 1) eq "c") {
		$last_update_token = "\"$last_update_token\"";
		$last_update_line = qq[\n"since": $last_update_token,];
	}
	my $query = <<"	END";
		["query", "$git_work_tree", {$last_update_line
			"fields": ["name"],
			"expression": ["not", ["dirname", ".git"]]
		}]
	END

	# Uncomment for debugging the watchman query
	# open (my $fh, ">", ".git/watchman-query.json");
	# print $fh $query;
	# close $fh;

	print CHLD_IN $query;
	close CHLD_IN;
	my $response = do {local $/; <CHLD_OUT>};

	# Uncomment for debugging the watch response
	# open ($fh, ">", ".git/watchman-response.json");
	# print $fh $response;
	# close $fh;

	die "Watchman: command returned no output.\n" .
	"Falling back to scanning...\n" if $response eq "";
	die "Watchman: command returned invalid output: $response\n" .
	"Falling back to scanning...\n" unless $response =~ /^\{/;

	return $json_pkg->new->utf8->decode($response);
}

sub is_work_tree_watched {
	my ($output) = @_;
	my $error = $output->{error};
	if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) {
		$retry--;
		my $response = qx/watchman watch "$git_work_tree"/;
		die "Failed to make watchman watch '$git_work_tree'.\n" .
		    "Falling back to scanning...\n" if $? != 0;
		$output = $json_pkg->new->utf8->decode($response);
		$error = $output->{error};
		die "Watchman: $error.\n" .
		"Falling back to scanning...\n" if $error;

		# Uncomment for debugging watchman output
		# open (my $fh, ">", ".git/watchman-output.out");
		# close $fh;

		# Watchman will always return all files on the first query so
		# return the fast "everything is dirty" flag to git and do the
		# Watchman query just to get it over with now so we won't pay
		# the cost in git to look up each individual file.
		my $o = watchman_clock();
		$error = $output->{error};

		die "Watchman: $error.\n" .
		"Falling back to scanning...\n" if $error;

		output_result($o->{clock}, ("/"));
		$last_update_token = $o->{clock};

		eval { launch_watchman() };
		return 0;
	}

	die "Watchman: $error.\n" .
	"Falling back to scanning...\n" if $error;

	return 1;
}

sub get_working_dir {
	my $working_dir;
	if ($^O =~ 'msys' || $^O =~ 'cygwin') {
		$working_dir = Win32::GetCwd();
		$working_dir =~ tr/\\/\//;
	} else {
		require Cwd;
		$working_dir = Cwd::cwd();
	}

	return $working_dir;
}
@@ -0,0 +1,14 @@
#!/bin/sh
#
# An example hook script to verify what is about to be committed
# by applypatch from an e-mail message.
#
# The hook should exit with non-zero status after issuing an
# appropriate message if it wants to stop the commit.
#
# To enable this hook, rename this file to "pre-applypatch".

. git-sh-setup
precommit="$(git rev-parse --git-path hooks/pre-commit)"
test -x "$precommit" && exec "$precommit" ${1+"$@"}
:
@@ -0,0 +1,49 @@
#!/bin/sh
#
# An example hook script to verify what is about to be committed.
# Called by "git commit" with no arguments.  The hook should
# exit with non-zero status after issuing an appropriate message if
# it wants to stop the commit.
#
# To enable this hook, rename this file to "pre-commit".

if git rev-parse --verify HEAD >/dev/null 2>&1
then
	against=HEAD
else
	# Initial commit: diff against an empty tree object
	against=$(git hash-object -t tree /dev/null)
fi

# If you want to allow non-ASCII filenames set this variable to true.
allownonascii=$(git config --type=bool hooks.allownonascii)

# Redirect output to stderr.
exec 1>&2

# Cross platform projects tend to avoid non-ASCII filenames; prevent
# them from being added to the repository. We exploit the fact that the
# printable range starts at the space character and ends with tilde.
if [ "$allownonascii" != "true" ] &&
	# Note that the use of brackets around a tr range is ok here, (it's
	# even required, for portability to Solaris 10's /usr/bin/tr), since
	# the square bracket bytes happen to fall in the designated range.
	test $(git diff-index --cached --name-only --diff-filter=A -z $against |
	  LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0
then
	cat <<\EOF
Error: Attempt to add a non-ASCII file name.

This can cause problems if you want to work with people on other platforms.

To be portable it is advisable to rename the file.

If you know what you are doing you can disable this check using:

  git config hooks.allownonascii true
EOF
	exit 1
fi

# If there are whitespace errors, print the offending file names and fail.
exec git diff-index --check --cached $against --
@@ -0,0 +1,13 @@
#!/bin/sh
#
# An example hook script to verify what is about to be committed.
# Called by "git merge" with no arguments.  The hook should
# exit with non-zero status after issuing an appropriate message to
# stderr if it wants to stop the merge commit.
#
# To enable this hook, rename this file to "pre-merge-commit".

. git-sh-setup
test -x "$GIT_DIR/hooks/pre-commit" &&
	exec "$GIT_DIR/hooks/pre-commit"
:
@@ -0,0 +1,53 @@
#!/bin/sh

# An example hook script to verify what is about to be pushed.  Called by "git
# push" after it has checked the remote status, but before anything has been
# pushed.  If this script exits with a non-zero status nothing will be pushed.
#
# This hook is called with the following parameters:
#
# $1 -- Name of the remote to which the push is being done
# $2 -- URL to which the push is being done
#
# If pushing without using a named remote those arguments will be equal.
#
# Information about the commits which are being pushed is supplied as lines to
# the standard input in the form:
#
#   <local ref> <local oid> <remote ref> <remote oid>
#
# This sample shows how to prevent push of commits where the log message starts
# with "WIP" (work in progress).

remote="$1"
url="$2"

zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')

while read local_ref local_oid remote_ref remote_oid
do
	if test "$local_oid" = "$zero"
	then
		# Handle delete
		:
	else
		if test "$remote_oid" = "$zero"
		then
			# New branch, examine all commits
			range="$local_oid"
		else
			# Update to existing branch, examine new commits
			range="$remote_oid..$local_oid"
		fi

		# Check for WIP commit
		commit=$(git rev-list -n 1 --grep '^WIP' "$range")
		if test -n "$commit"
		then
			echo >&2 "Found WIP commit in $local_ref, not pushing"
			exit 1
		fi
	fi
done

exit 0