pembot 0.0.3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pembot might be problematic. Click here for more details.

Files changed (129) hide show
  1. pembot/.git/COMMIT_EDITMSG +1 -0
  2. pembot/.git/HEAD +1 -0
  3. pembot/.git/config +11 -0
  4. pembot/.git/description +1 -0
  5. pembot/.git/hooks/applypatch-msg.sample +15 -0
  6. pembot/.git/hooks/commit-msg.sample +24 -0
  7. pembot/.git/hooks/fsmonitor-watchman.sample +174 -0
  8. pembot/.git/hooks/post-update.sample +8 -0
  9. pembot/.git/hooks/pre-applypatch.sample +14 -0
  10. pembot/.git/hooks/pre-commit.sample +49 -0
  11. pembot/.git/hooks/pre-merge-commit.sample +13 -0
  12. pembot/.git/hooks/pre-push.sample +53 -0
  13. pembot/.git/hooks/pre-rebase.sample +169 -0
  14. pembot/.git/hooks/pre-receive.sample +24 -0
  15. pembot/.git/hooks/prepare-commit-msg.sample +42 -0
  16. pembot/.git/hooks/push-to-checkout.sample +78 -0
  17. pembot/.git/hooks/sendemail-validate.sample +77 -0
  18. pembot/.git/hooks/update.sample +128 -0
  19. pembot/.git/index +0 -0
  20. pembot/.git/info/exclude +6 -0
  21. pembot/.git/logs/HEAD +6 -0
  22. pembot/.git/logs/refs/heads/main +6 -0
  23. pembot/.git/logs/refs/remotes/origin/HEAD +1 -0
  24. pembot/.git/logs/refs/remotes/origin/main +5 -0
  25. pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c +0 -0
  26. pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1 +0 -0
  27. pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63 +0 -0
  28. pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7 +0 -0
  29. pembot/.git/objects/28/db0ab48059acccd7d257aa02e52e9b6b83a4a5 +0 -0
  30. pembot/.git/objects/35/97e518a8658280be9f377f78edf1dfa1f23814 +0 -0
  31. pembot/.git/objects/3d/07d3b29ff53d95de3898fb786d61732f210515 +0 -0
  32. pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb +0 -0
  33. pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba +0 -0
  34. pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0 +0 -0
  35. pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904 +0 -0
  36. pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba +0 -0
  37. pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177 +0 -0
  38. pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9 +0 -0
  39. pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331 +0 -0
  40. pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5 +0 -0
  41. pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25 +0 -0
  42. pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7 +0 -0
  43. pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8 +0 -0
  44. pembot/.git/objects/93/8f29d9b4b1ae86e39dddf9e3d115a82ddfc9b6 +0 -0
  45. pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456 +0 -0
  46. pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7 +0 -0
  47. pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880 +0 -0
  48. pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d +0 -0
  49. pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb +0 -0
  50. pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3 +0 -0
  51. pembot/.git/objects/bf/32a7e6872e5dc4025ee3df3c921ec7ade0855f +0 -0
  52. pembot/.git/objects/c0/793458db6e1bee7f79f1a504fb8ff4963f8ed3 +0 -0
  53. pembot/.git/objects/c2/443060c07101948487cfa93cc39e082e9e0f5f +0 -0
  54. pembot/.git/objects/e5/3070f2b07f45d031444b09b1b38658f3caf29e +0 -0
  55. pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc +0 -0
  56. pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d +0 -0
  57. pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58 +0 -0
  58. pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238 +0 -0
  59. pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552 +0 -0
  60. pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8 +0 -0
  61. pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300 +0 -0
  62. pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88 +0 -0
  63. pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6 +0 -0
  64. pembot/.git/objects/fb/6c90c9ce5e0cdfbe074a3f060afc66f62eefde +0 -0
  65. pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2 +0 -0
  66. pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx +0 -0
  67. pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack +0 -0
  68. pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev +0 -0
  69. pembot/.git/packed-refs +2 -0
  70. pembot/.git/refs/heads/main +1 -0
  71. pembot/.git/refs/remotes/origin/HEAD +1 -0
  72. pembot/.git/refs/remotes/origin/main +1 -0
  73. pembot/.gitignore +7 -0
  74. pembot/AnyToText/__init__.py +0 -0
  75. pembot/AnyToText/convertor.py +260 -0
  76. pembot/LICENSE +674 -0
  77. pembot/TextEmbedder/__init__.py +0 -0
  78. pembot/TextEmbedder/gemini_embedder.py +27 -0
  79. pembot/TextEmbedder/mongodb_embedder.py +258 -0
  80. pembot/TextEmbedder/mongodb_index_creator.py +133 -0
  81. pembot/TextEmbedder/vector_query.py +64 -0
  82. pembot/__init__.py +6 -0
  83. pembot/config/config.yaml +5 -0
  84. pembot/gartner.py +140 -0
  85. pembot/main.py +208 -0
  86. pembot/output_structure_local.py +63 -0
  87. pembot/pdf2markdown/.git/HEAD +1 -0
  88. pembot/pdf2markdown/.git/config +11 -0
  89. pembot/pdf2markdown/.git/description +1 -0
  90. pembot/pdf2markdown/.git/hooks/applypatch-msg.sample +15 -0
  91. pembot/pdf2markdown/.git/hooks/commit-msg.sample +24 -0
  92. pembot/pdf2markdown/.git/hooks/fsmonitor-watchman.sample +174 -0
  93. pembot/pdf2markdown/.git/hooks/post-update.sample +8 -0
  94. pembot/pdf2markdown/.git/hooks/pre-applypatch.sample +14 -0
  95. pembot/pdf2markdown/.git/hooks/pre-commit.sample +49 -0
  96. pembot/pdf2markdown/.git/hooks/pre-merge-commit.sample +13 -0
  97. pembot/pdf2markdown/.git/hooks/pre-push.sample +53 -0
  98. pembot/pdf2markdown/.git/hooks/pre-rebase.sample +169 -0
  99. pembot/pdf2markdown/.git/hooks/pre-receive.sample +24 -0
  100. pembot/pdf2markdown/.git/hooks/prepare-commit-msg.sample +42 -0
  101. pembot/pdf2markdown/.git/hooks/push-to-checkout.sample +78 -0
  102. pembot/pdf2markdown/.git/hooks/sendemail-validate.sample +77 -0
  103. pembot/pdf2markdown/.git/hooks/update.sample +128 -0
  104. pembot/pdf2markdown/.git/index +0 -0
  105. pembot/pdf2markdown/.git/info/exclude +6 -0
  106. pembot/pdf2markdown/.git/logs/HEAD +1 -0
  107. pembot/pdf2markdown/.git/logs/refs/heads/main +1 -0
  108. pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD +1 -0
  109. pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx +0 -0
  110. pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack +0 -0
  111. pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev +0 -0
  112. pembot/pdf2markdown/.git/packed-refs +2 -0
  113. pembot/pdf2markdown/.git/refs/heads/main +1 -0
  114. pembot/pdf2markdown/.git/refs/remotes/origin/HEAD +1 -0
  115. pembot/pdf2markdown/LICENSE +21 -0
  116. pembot/pdf2markdown/README.md +107 -0
  117. pembot/pdf2markdown/__init__.py +0 -0
  118. pembot/pdf2markdown/config/config.yaml +2 -0
  119. pembot/pdf2markdown/extract.py +888 -0
  120. pembot/pdf2markdown/requirements.txt +8 -0
  121. pembot/pem.py +157 -0
  122. pembot/query.py +204 -0
  123. pembot/utils/__init__.py +0 -0
  124. pembot/utils/inference_client.py +132 -0
  125. pembot/utils/string_tools.py +45 -0
  126. pembot-0.0.3.dist-info/METADATA +8 -0
  127. pembot-0.0.3.dist-info/RECORD +129 -0
  128. pembot-0.0.3.dist-info/WHEEL +5 -0
  129. pembot-0.0.3.dist-info/licenses/LICENSE +674 -0
pembot/main.py ADDED
@@ -0,0 +1,208 @@
1
+ from pathlib import Path
2
+
3
+ from huggingface_hub import InferenceClient
4
+ from huggingface_hub.inference._providers import PROVIDER_T
5
+ import ollama
6
+ from pymongo import MongoClient
7
+ from pembot.AnyToText.convertor import Convertor
8
+ from pembot.TextEmbedder.mongodb_embedder import process_document_and_embed
9
+ from pembot.query import rag_query_llm, remove_bs
10
+ import os
11
+ import json
12
+ from pembot.utils.string_tools import make_it_an_id
13
+ from schema.structure import required_fields
14
+
15
+
16
def make_query(required_fields: list[tuple[str, str, str, str]]):
    """Build an LLM prompt requesting a JSON object with the given fields.

    Args:
        required_fields: tuples of (field_name, field_type,
            field_description, default_value), one per key the model
            must emit.

    Returns:
        The complete prompt string, ending with "JSON Output:".
    """
    # One '"name": "(type) <description, default: value>"' entry per field.
    entries = [
        f' "{field_name}": "({field_type}) <{field_description}, default: {default_value}>"'
        for field_name, field_type, field_description, default_value in required_fields
    ]
    # Join with commas; an empty field list yields the same "{\n}" the
    # original loop produced.
    body = ",\n".join(entries)
    json_structure_definition = "{\n" + (body + "\n" if entries else "") + "}"

    query = (
        "Extract the following information from the above provided context and return it as a JSON object. "
        "Ensure the output strictly conforms to the JSON format. "
        "Use the default values if the information is not found in the text.\n"
        "The required JSON structure is:\n"
        f"{json_structure_definition}\n\n"
        "JSON Output:"
    )
    return query
41
+
42
def save_to_json_file(llm_output: str, filepath: Path):
    """Validate *llm_output* as JSON and write it, pretty-printed, to *filepath*.

    Parent directories are created as needed.

    Returns:
        The parsed JSON data on success, or None when the output is not
        valid JSON or the file cannot be written.
    """
    try:
        filepath.parent.mkdir(parents=True, exist_ok=True)

        # Parsing validates the LLM output and lets us re-serialize it
        # with consistent formatting.
        json_data = json.loads(llm_output)
        filepath.write_text(
            json.dumps(json_data, indent=4, ensure_ascii=False),
            encoding='utf-8',
        )
        print(f"Successfully saved JSON to {filepath}")
        return json_data
    except json.JSONDecodeError:
        print(f"Error: LLM output is not valid JSON. Could not save to {filepath}")
        print("LLM Output was:\n", llm_output)
        # Optionally, save the raw invalid output for debugging
        # raw_output_path = filepath.with_suffix('.raw_llm_output.txt')
        # raw_output_path.write_text(llm_output, encoding='utf-8')
    except IOError as e:
        print(f"Error saving file to {filepath}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred in save_to_json_file: {e}")
69
+
70
def make_document_summarization_and_embeddings(db_client, llm_client, inference_client, docs_dir: Path, text_out_dir: Path, required_fields: list[tuple[str, str, str, str]], chunk_size: int = 600, embedding_model: str = 'nomic-embed-text:v1.5', llm_provider_name: PROVIDER_T = "novita", model_name="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", embeddings_collection: str = "doc_chunks", index_name="test_search"):
    """Convert, embed, and summarize every document in *docs_dir*.

    For each file (skipped once its JSON summary already exists):
      1. convert the document to markdown in *text_out_dir* (Convertor),
      2. chunk and embed the markdown into MongoDB (process_document_and_embed),
      3. RAG-query the LLM for *required_fields* and save the resulting JSON
         under text_out_dir/json/.

    Args:
        db_client: MongoDB database handle.
        llm_client: ollama client used for embedding generation.
        inference_client: inference client used for LLM calls.
        docs_dir: directory containing the input documents.
        text_out_dir: output directory for markdown and JSON files.
        required_fields: (name, type, description, default) tuples to extract.
        chunk_size: characters per embedded chunk.
        embedding_model: embedding model identifier.
        llm_provider_name: inference provider for the LLM.
        model_name: LLM model identifier.
        embeddings_collection: MongoDB collection holding the chunk embeddings.
        index_name: MongoDB vector search index name.
    """
    for docfile in docs_dir.iterdir():

        file_root = os.path.splitext(docfile.name)[0]
        expected_json = text_out_dir / 'json' / (file_root + '.json')
        document_id = make_it_an_id(file_root)

        # BUG FIX: the original tested `docfile.is_file` without calling it;
        # the bound method is always truthy, so directories were processed too.
        if docfile.is_file() and not expected_json.exists():

            expected_markdown = text_out_dir / (file_root + '.md')
            if not expected_markdown.exists():
                # Constructing Convertor writes the markdown as a side effect;
                # the instance itself is not needed.
                Convertor(docfile, text_out_dir)
                print("markdown made.", text_out_dir)

            # the case that the convertor already produced the json
            if expected_json.exists():
                continue

            # chunk the markdown and store embeddings in the vector collection
            process_document_and_embed(db_client, llm_client, inference_client, expected_markdown, chunk_size=chunk_size, embedding_model=embedding_model, embeddings_collection_name=embeddings_collection)
            print("its in the db now")

            query = make_query(required_fields)
            print("full query is: ")
            print(query)
            filename_string = file_root + '.json'
            # NOTE(review): x[1] is the field *type* in make_query's unpacking
            # (name, type, description, default), yet this variable is named
            # "descriptions" — confirm which element rag_query_llm expects.
            required_fields_descriptions = list(map(lambda x: x[1], required_fields))
            llm_output = rag_query_llm(db_client, llm_client, inference_client, query, document_id, required_fields_descriptions, no_of_fields=len(required_fields), llm_provider_name=llm_provider_name, model_name=model_name, embedding_model=embedding_model, embeddings_collection=embeddings_collection, index_name=index_name)

            # remove_bs() cleans the raw LLM output into a JSON string
            jsonstr = remove_bs(llm_output)

            save_to_json_file(jsonstr, text_out_dir / 'json' / filename_string)
109
+
110
+
111
def upload_summaries(json_dir: Path, docs_collection):
    """Upsert every JSON summary file in *json_dir* into MongoDB.

    Each file's contents are stored under a `document_name_id` derived from
    the file name. Existing documents with the same id are left untouched,
    because `$setOnInsert` only applies when a new document is inserted.

    Args:
        json_dir: directory containing <name>.json summary files.
        docs_collection: pymongo collection to upsert into.
    """
    for json_path in json_dir.iterdir():

        # Robustness fix: ignore subdirectories and stray non-JSON files
        # that the original would have tried to json.load().
        if not json_path.is_file() or json_path.suffix.lower() != '.json':
            continue

        base_name, _ = os.path.splitext(json_path.name)
        # The markdown this summary was derived from (logged only).
        corresponding_text_file = json_dir.parent / (base_name + ".md")
        document_name_id = make_it_an_id(base_name)

        # Fix: read with an explicit encoding instead of the platform default.
        with open(json_path, encoding='utf-8') as json_file:
            json_data = json.load(json_file)

        print("pushing doc: ", corresponding_text_file, json_path)
        result = docs_collection.update_one(
            {"document_name_id": document_name_id},  # Filter by the derived id
            {"$setOnInsert": {**json_data, "document_name_id": document_name_id}},  # Set these fields ONLY on insert
            upsert=True  # Insert if no matching document is found
        )

        if result.upserted_id:
            print(f"New Document inserted with _id: {result.upserted_id}")
        else:
            print("Document with this docId found. Updated.")
133
+
134
+
135
+
136
def initit(db_client, llm_client, inference_client, chunk_size= 500, embedding_model= "BAAI/bge-en-icl", llm_provider_name: PROVIDER_T= "novita", model_name= "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", embeddings_collection: str= "doc_chunks", index_name= "test_search"):
    """Prepare the working directories and run the summarization pipeline.

    Looks for a 'documents' directory in the parent of the current working
    directory; markdown and JSON outputs land in a sibling 'text-outputs'
    directory (both are created if missing).

    Returns:
        Path to the text-outputs directory.
    """
    workspace = Path.cwd().parent
    docs_dir = workspace / 'documents'
    text_out_dir = workspace / 'text-outputs'

    for directory in (docs_dir, text_out_dir):
        directory.mkdir(parents=True, exist_ok=True)

    make_document_summarization_and_embeddings(
        db_client, llm_client, inference_client, docs_dir, text_out_dir,
        required_fields, chunk_size=chunk_size, embedding_model=embedding_model,
        llm_provider_name=llm_provider_name, model_name=model_name,
        embeddings_collection=embeddings_collection, index_name=index_name,
    )

    return text_out_dir
148
+
149
+
150
+ if __name__ == "__main__":
151
+
152
+ mongodb_uri= os.environ['MONGODB_PEMBOT']
153
+ mc = MongoClient(mongodb_uri)
154
+
155
+ llm_client= ollama.Client()
156
+
157
+ #### FOR USING JINA INSTEAD OF HUGGINGFACE SDK, REPLACE WITH THE InferenceClient TOP IMPORT
158
+ # from pembot.utils.inference_client import InferenceClient
159
+ # JINA_API_KEY= os.environ['JINA_API_KEY']
160
+ # inference_client= InferenceClient(
161
+ # provider="Jina AI",
162
+ # api_key= JINA_API_KEY,
163
+ # )
164
+
165
+ inference_client= InferenceClient(
166
+ provider="hf-inference",
167
+ api_key= os.environ["HF_TOKEN"],
168
+ )
169
+
170
+ mc.admin.command('ping')
171
+ print("ping test ok")
172
+ database = mc["pembot"]
173
+ print("dbs and cols loaded")
174
+
175
+ embeddings_collection: str= "doc_chunks"
176
+
177
+ # if you want to use LLM inference from a different provider than embeddings
178
+ llm_provider_name: PROVIDER_T="nebius"
179
+
180
+ # nerfed, but provided by hf serverless inference: BAAI/bge-small-en-v1.5
181
+ # Worth mentioning:
182
+ # jinaai/jina-embeddings-v3
183
+ # BAAI/bge-base-en-v1.5
184
+ # nomic-ai/nomic-embed-text-v1.5
185
+ # embedding_model: str= 'BAAI/bge-base-en-v1.5'
186
+ embedding_model: str= 'gemini-embedding-exp-03-07'
187
+ # model_name: str= "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
188
+ # model_name: str= "google/gemma-3-27b-it"
189
+ model_name: str= "gemini-2.5-flash-preview-05-20"
190
+
191
+ index_name: str= "gemini_vectors"
192
+
193
+ # output tokens are ~1000 at max
194
+ # chunk_size= 1000 # this is for chhote mote models, gemma 1b or smth
195
+ chunk_size= int(2_50_000 / len(required_fields)) # we got 63k tokens => ~2.5 lac characters
196
+
197
+
198
+ #### REQUIRED_FIELDS:
199
+ # an array of tuples:
200
+ # (field name, field description, field type, default value)
201
+
202
+ process_output_dir= initit(database, llm_client, inference_client, chunk_size= chunk_size, embedding_model= embedding_model, llm_provider_name= llm_provider_name, model_name= model_name, embeddings_collection= embeddings_collection, index_name= index_name)
203
+
204
+ docs_collection= database["summary_docs"]
205
+ upload_summaries(process_output_dir / 'json', docs_collection)
206
+
207
+
208
+
@@ -0,0 +1,63 @@
1
+ from llama_cpp import Llama
2
+ from huggingface_hub import hf_hub_download
3
+ from lmformatenforcer import JsonSchemaParser
4
+ from lmformatenforcer.integrations.llamacpp import build_llamacpp_logits_processor
5
+ from llama_cpp import LogitsProcessorList
6
+ from pydantic import BaseModel
7
+
8
def generate_structured_data(input_data: str, pydantic_class: type[BaseModel], model_path: str = "TheBloke/Llama-2-7b-Chat-GGUF") -> dict:
    """
    Generate structured data according to a Pydantic class.

    Args:
    - input_data (str): The input data to be processed.
    - pydantic_class (type[BaseModel]): The Pydantic model class describing the output structure.
    - model_path (str): The Hugging Face repo id of the GGUF Llama model.

    Returns:
    - dict: The structured data conforming to the Pydantic class's JSON schema.

    Raises:
    - json.JSONDecodeError: if the model output is not valid JSON despite the
      schema-constrained sampling.
    """
    import json  # local import: only needed to parse the model output

    # Download the Llama model if it doesn't exist (huggingface_hub caches it)
    downloaded_model_path = hf_hub_download(repo_id=model_path, filename="llama-2-7b-chat.Q5_K_M.gguf")

    # Initialize the Llama model
    llm = Llama(model_path=downloaded_model_path)

    # Define the system prompt
    DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
"""

    # Wrap a message in the Llama-2 chat prompt format.
    def get_prompt(message: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
        return f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{message} [/INST]'

    # Run generation with token sampling constrained to the JSON schema.
    def llamacpp_with_json_schema_parser(llm: Llama, prompt: str, json_schema_parser: JsonSchemaParser) -> str:
        logits_processors: LogitsProcessorList = LogitsProcessorList([build_llamacpp_logits_processor(llm, json_schema_parser)])
        output = llm(prompt, logits_processor=logits_processors)
        text: str = output['choices'][0]['text']
        return text

    # Define the question and prompt
    question = f'Please generate structured data according to the following JSON schema: {pydantic_class.schema_json()}. The input data is: {input_data}'
    prompt = get_prompt(question)

    # Generate the structured data
    json_schema_parser = JsonSchemaParser(pydantic_class.schema())
    result = llamacpp_with_json_schema_parser(llm, prompt, json_schema_parser)

    # SECURITY FIX: the original used eval() on the model output, which
    # executes arbitrary code. The output is schema-constrained JSON, so
    # parse it as JSON instead.
    return json.loads(result)
53
+
54
# Example usage:
class PlanetSchema(BaseModel):
    # A single planet record.
    planet_name: str

class PlanetList(BaseModel):
    # Wrapper so the schema's top level is an object rather than a bare array.
    planets: list[PlanetSchema]

if __name__ == "__main__":
    # FIX: guarded under __main__ — the original ran this at import time,
    # triggering a multi-GB model download and LLM inference whenever the
    # module was imported. The classes above remain importable.
    input_data = "Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune"
    result = generate_structured_data(input_data, PlanetList)
    print(result)
@@ -0,0 +1 @@
1
+ ref: refs/heads/main
@@ -0,0 +1,11 @@
1
+ [core]
2
+ repositoryformatversion = 0
3
+ filemode = true
4
+ bare = false
5
+ logallrefupdates = true
6
+ [remote "origin"]
7
+ url = https://github.com/iamarunbrahma/pdf-to-markdown
8
+ fetch = +refs/heads/*:refs/remotes/origin/*
9
+ [branch "main"]
10
+ remote = origin
11
+ merge = refs/heads/main
@@ -0,0 +1 @@
1
+ Unnamed repository; edit this file 'description' to name the repository.
@@ -0,0 +1,15 @@
1
+ #!/bin/sh
2
+ #
3
+ # An example hook script to check the commit log message taken by
4
+ # applypatch from an e-mail message.
5
+ #
6
+ # The hook should exit with non-zero status after issuing an
7
+ # appropriate message if it wants to stop the commit. The hook is
8
+ # allowed to edit the commit message file.
9
+ #
10
+ # To enable this hook, rename this file to "applypatch-msg".
11
+
12
+ . git-sh-setup
13
+ commitmsg="$(git rev-parse --git-path hooks/commit-msg)"
14
+ test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"}
15
+ :
@@ -0,0 +1,24 @@
1
+ #!/bin/sh
2
+ #
3
+ # An example hook script to check the commit log message.
4
+ # Called by "git commit" with one argument, the name of the file
5
+ # that has the commit message. The hook should exit with non-zero
6
+ # status after issuing an appropriate message if it wants to stop the
7
+ # commit. The hook is allowed to edit the commit message file.
8
+ #
9
+ # To enable this hook, rename this file to "commit-msg".
10
+
11
+ # Uncomment the below to add a Signed-off-by line to the message.
12
+ # Doing this in a hook is a bad idea in general, but the prepare-commit-msg
13
+ # hook is more suited to it.
14
+ #
15
+ # SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
16
+ # grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1"
17
+
18
+ # This example catches duplicate Signed-off-by lines.
19
+
20
+ test "" = "$(grep '^Signed-off-by: ' "$1" |
21
+ sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || {
22
+ echo >&2 Duplicate Signed-off-by lines.
23
+ exit 1
24
+ }
@@ -0,0 +1,174 @@
1
+ #!/usr/bin/perl
2
+
3
+ use strict;
4
+ use warnings;
5
+ use IPC::Open2;
6
+
7
+ # An example hook script to integrate Watchman
8
+ # (https://facebook.github.io/watchman/) with git to speed up detecting
9
+ # new and modified files.
10
+ #
11
+ # The hook is passed a version (currently 2) and last update token
12
+ # formatted as a string and outputs to stdout a new update token and
13
+ # all files that have been modified since the update token. Paths must
14
+ # be relative to the root of the working tree and separated by a single NUL.
15
+ #
16
+ # To enable this hook, rename this file to "query-watchman" and set
17
+ # 'git config core.fsmonitor .git/hooks/query-watchman'
18
+ #
19
+ my ($version, $last_update_token) = @ARGV;
20
+
21
+ # Uncomment for debugging
22
+ # print STDERR "$0 $version $last_update_token\n";
23
+
24
+ # Check the hook interface version
25
+ if ($version ne 2) {
26
+ die "Unsupported query-fsmonitor hook version '$version'.\n" .
27
+ "Falling back to scanning...\n";
28
+ }
29
+
30
+ my $git_work_tree = get_working_dir();
31
+
32
+ my $retry = 1;
33
+
34
+ my $json_pkg;
35
+ eval {
36
+ require JSON::XS;
37
+ $json_pkg = "JSON::XS";
38
+ 1;
39
+ } or do {
40
+ require JSON::PP;
41
+ $json_pkg = "JSON::PP";
42
+ };
43
+
44
+ launch_watchman();
45
+
46
+ sub launch_watchman {
47
+ my $o = watchman_query();
48
+ if (is_work_tree_watched($o)) {
49
+ output_result($o->{clock}, @{$o->{files}});
50
+ }
51
+ }
52
+
53
+ sub output_result {
54
+ my ($clockid, @files) = @_;
55
+
56
+ # Uncomment for debugging watchman output
57
+ # open (my $fh, ">", ".git/watchman-output.out");
58
+ # binmode $fh, ":utf8";
59
+ # print $fh "$clockid\n@files\n";
60
+ # close $fh;
61
+
62
+ binmode STDOUT, ":utf8";
63
+ print $clockid;
64
+ print "\0";
65
+ local $, = "\0";
66
+ print @files;
67
+ }
68
+
69
+ sub watchman_clock {
70
+ my $response = qx/watchman clock "$git_work_tree"/;
71
+ die "Failed to get clock id on '$git_work_tree'.\n" .
72
+ "Falling back to scanning...\n" if $? != 0;
73
+
74
+ return $json_pkg->new->utf8->decode($response);
75
+ }
76
+
77
+ sub watchman_query {
78
+ my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty')
79
+ or die "open2() failed: $!\n" .
80
+ "Falling back to scanning...\n";
81
+
82
+ # In the query expression below we're asking for names of files that
83
+ # changed since $last_update_token but not from the .git folder.
84
+ #
85
+ # To accomplish this, we're using the "since" generator to use the
86
+ # recency index to select candidate nodes and "fields" to limit the
87
+ # output to file names only. Then we're using the "expression" term to
88
+ # further constrain the results.
89
+ my $last_update_line = "";
90
+ if (substr($last_update_token, 0, 1) eq "c") {
91
+ $last_update_token = "\"$last_update_token\"";
92
+ $last_update_line = qq[\n"since": $last_update_token,];
93
+ }
94
+ my $query = <<" END";
95
+ ["query", "$git_work_tree", {$last_update_line
96
+ "fields": ["name"],
97
+ "expression": ["not", ["dirname", ".git"]]
98
+ }]
99
+ END
100
+
101
+ # Uncomment for debugging the watchman query
102
+ # open (my $fh, ">", ".git/watchman-query.json");
103
+ # print $fh $query;
104
+ # close $fh;
105
+
106
+ print CHLD_IN $query;
107
+ close CHLD_IN;
108
+ my $response = do {local $/; <CHLD_OUT>};
109
+
110
+ # Uncomment for debugging the watch response
111
+ # open ($fh, ">", ".git/watchman-response.json");
112
+ # print $fh $response;
113
+ # close $fh;
114
+
115
+ die "Watchman: command returned no output.\n" .
116
+ "Falling back to scanning...\n" if $response eq "";
117
+ die "Watchman: command returned invalid output: $response\n" .
118
+ "Falling back to scanning...\n" unless $response =~ /^\{/;
119
+
120
+ return $json_pkg->new->utf8->decode($response);
121
+ }
122
+
123
+ sub is_work_tree_watched {
124
+ my ($output) = @_;
125
+ my $error = $output->{error};
126
+ if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) {
127
+ $retry--;
128
+ my $response = qx/watchman watch "$git_work_tree"/;
129
+ die "Failed to make watchman watch '$git_work_tree'.\n" .
130
+ "Falling back to scanning...\n" if $? != 0;
131
+ $output = $json_pkg->new->utf8->decode($response);
132
+ $error = $output->{error};
133
+ die "Watchman: $error.\n" .
134
+ "Falling back to scanning...\n" if $error;
135
+
136
+ # Uncomment for debugging watchman output
137
+ # open (my $fh, ">", ".git/watchman-output.out");
138
+ # close $fh;
139
+
140
+ # Watchman will always return all files on the first query so
141
+ # return the fast "everything is dirty" flag to git and do the
142
+ # Watchman query just to get it over with now so we won't pay
143
+ # the cost in git to look up each individual file.
144
+ my $o = watchman_clock();
145
+ $error = $output->{error};
146
+
147
+ die "Watchman: $error.\n" .
148
+ "Falling back to scanning...\n" if $error;
149
+
150
+ output_result($o->{clock}, ("/"));
151
+ $last_update_token = $o->{clock};
152
+
153
+ eval { launch_watchman() };
154
+ return 0;
155
+ }
156
+
157
+ die "Watchman: $error.\n" .
158
+ "Falling back to scanning...\n" if $error;
159
+
160
+ return 1;
161
+ }
162
+
163
+ sub get_working_dir {
164
+ my $working_dir;
165
+ if ($^O =~ 'msys' || $^O =~ 'cygwin') {
166
+ $working_dir = Win32::GetCwd();
167
+ $working_dir =~ tr/\\/\//;
168
+ } else {
169
+ require Cwd;
170
+ $working_dir = Cwd::cwd();
171
+ }
172
+
173
+ return $working_dir;
174
+ }
@@ -0,0 +1,8 @@
1
+ #!/bin/sh
2
+ #
3
+ # An example hook script to prepare a packed repository for use over
4
+ # dumb transports.
5
+ #
6
+ # To enable this hook, rename this file to "post-update".
7
+
8
+ exec git update-server-info
@@ -0,0 +1,14 @@
1
+ #!/bin/sh
2
+ #
3
+ # An example hook script to verify what is about to be committed
4
+ # by applypatch from an e-mail message.
5
+ #
6
+ # The hook should exit with non-zero status after issuing an
7
+ # appropriate message if it wants to stop the commit.
8
+ #
9
+ # To enable this hook, rename this file to "pre-applypatch".
10
+
11
+ . git-sh-setup
12
+ precommit="$(git rev-parse --git-path hooks/pre-commit)"
13
+ test -x "$precommit" && exec "$precommit" ${1+"$@"}
14
+ :
@@ -0,0 +1,49 @@
1
+ #!/bin/sh
2
+ #
3
+ # An example hook script to verify what is about to be committed.
4
+ # Called by "git commit" with no arguments. The hook should
5
+ # exit with non-zero status after issuing an appropriate message if
6
+ # it wants to stop the commit.
7
+ #
8
+ # To enable this hook, rename this file to "pre-commit".
9
+
10
+ if git rev-parse --verify HEAD >/dev/null 2>&1
11
+ then
12
+ against=HEAD
13
+ else
14
+ # Initial commit: diff against an empty tree object
15
+ against=$(git hash-object -t tree /dev/null)
16
+ fi
17
+
18
+ # If you want to allow non-ASCII filenames set this variable to true.
19
+ allownonascii=$(git config --type=bool hooks.allownonascii)
20
+
21
+ # Redirect output to stderr.
22
+ exec 1>&2
23
+
24
+ # Cross platform projects tend to avoid non-ASCII filenames; prevent
25
+ # them from being added to the repository. We exploit the fact that the
26
+ # printable range starts at the space character and ends with tilde.
27
+ if [ "$allownonascii" != "true" ] &&
28
+ # Note that the use of brackets around a tr range is ok here, (it's
29
+ # even required, for portability to Solaris 10's /usr/bin/tr), since
30
+ # the square bracket bytes happen to fall in the designated range.
31
+ test $(git diff-index --cached --name-only --diff-filter=A -z $against |
32
+ LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0
33
+ then
34
+ cat <<\EOF
35
+ Error: Attempt to add a non-ASCII file name.
36
+
37
+ This can cause problems if you want to work with people on other platforms.
38
+
39
+ To be portable it is advisable to rename the file.
40
+
41
+ If you know what you are doing you can disable this check using:
42
+
43
+ git config hooks.allownonascii true
44
+ EOF
45
+ exit 1
46
+ fi
47
+
48
+ # If there are whitespace errors, print the offending file names and fail.
49
+ exec git diff-index --check --cached $against --
@@ -0,0 +1,13 @@
1
+ #!/bin/sh
2
+ #
3
+ # An example hook script to verify what is about to be committed.
4
+ # Called by "git merge" with no arguments. The hook should
5
+ # exit with non-zero status after issuing an appropriate message to
6
+ # stderr if it wants to stop the merge commit.
7
+ #
8
+ # To enable this hook, rename this file to "pre-merge-commit".
9
+
10
+ . git-sh-setup
11
+ test -x "$GIT_DIR/hooks/pre-commit" &&
12
+ exec "$GIT_DIR/hooks/pre-commit"
13
+ :
@@ -0,0 +1,53 @@
1
+ #!/bin/sh
2
+
3
+ # An example hook script to verify what is about to be pushed. Called by "git
4
+ # push" after it has checked the remote status, but before anything has been
5
+ # pushed. If this script exits with a non-zero status nothing will be pushed.
6
+ #
7
+ # This hook is called with the following parameters:
8
+ #
9
+ # $1 -- Name of the remote to which the push is being done
10
+ # $2 -- URL to which the push is being done
11
+ #
12
+ # If pushing without using a named remote those arguments will be equal.
13
+ #
14
+ # Information about the commits which are being pushed is supplied as lines to
15
+ # the standard input in the form:
16
+ #
17
+ # <local ref> <local oid> <remote ref> <remote oid>
18
+ #
19
+ # This sample shows how to prevent push of commits where the log message starts
20
+ # with "WIP" (work in progress).
21
+
22
+ remote="$1"
23
+ url="$2"
24
+
25
+ zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
26
+
27
+ while read local_ref local_oid remote_ref remote_oid
28
+ do
29
+ if test "$local_oid" = "$zero"
30
+ then
31
+ # Handle delete
32
+ :
33
+ else
34
+ if test "$remote_oid" = "$zero"
35
+ then
36
+ # New branch, examine all commits
37
+ range="$local_oid"
38
+ else
39
+ # Update to existing branch, examine new commits
40
+ range="$remote_oid..$local_oid"
41
+ fi
42
+
43
+ # Check for WIP commit
44
+ commit=$(git rev-list -n 1 --grep '^WIP' "$range")
45
+ if test -n "$commit"
46
+ then
47
+ echo >&2 "Found WIP commit in $local_ref, not pushing"
48
+ exit 1
49
+ fi
50
+ fi
51
+ done
52
+
53
+ exit 0