pembot-0.0.3-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pembot might be problematic. Click here for more details.

Files changed (129) hide show
  1. pembot/.git/COMMIT_EDITMSG +1 -0
  2. pembot/.git/HEAD +1 -0
  3. pembot/.git/config +11 -0
  4. pembot/.git/description +1 -0
  5. pembot/.git/hooks/applypatch-msg.sample +15 -0
  6. pembot/.git/hooks/commit-msg.sample +24 -0
  7. pembot/.git/hooks/fsmonitor-watchman.sample +174 -0
  8. pembot/.git/hooks/post-update.sample +8 -0
  9. pembot/.git/hooks/pre-applypatch.sample +14 -0
  10. pembot/.git/hooks/pre-commit.sample +49 -0
  11. pembot/.git/hooks/pre-merge-commit.sample +13 -0
  12. pembot/.git/hooks/pre-push.sample +53 -0
  13. pembot/.git/hooks/pre-rebase.sample +169 -0
  14. pembot/.git/hooks/pre-receive.sample +24 -0
  15. pembot/.git/hooks/prepare-commit-msg.sample +42 -0
  16. pembot/.git/hooks/push-to-checkout.sample +78 -0
  17. pembot/.git/hooks/sendemail-validate.sample +77 -0
  18. pembot/.git/hooks/update.sample +128 -0
  19. pembot/.git/index +0 -0
  20. pembot/.git/info/exclude +6 -0
  21. pembot/.git/logs/HEAD +6 -0
  22. pembot/.git/logs/refs/heads/main +6 -0
  23. pembot/.git/logs/refs/remotes/origin/HEAD +1 -0
  24. pembot/.git/logs/refs/remotes/origin/main +5 -0
  25. pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c +0 -0
  26. pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1 +0 -0
  27. pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63 +0 -0
  28. pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7 +0 -0
  29. pembot/.git/objects/28/db0ab48059acccd7d257aa02e52e9b6b83a4a5 +0 -0
  30. pembot/.git/objects/35/97e518a8658280be9f377f78edf1dfa1f23814 +0 -0
  31. pembot/.git/objects/3d/07d3b29ff53d95de3898fb786d61732f210515 +0 -0
  32. pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb +0 -0
  33. pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba +0 -0
  34. pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0 +0 -0
  35. pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904 +0 -0
  36. pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba +0 -0
  37. pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177 +0 -0
  38. pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9 +0 -0
  39. pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331 +0 -0
  40. pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5 +0 -0
  41. pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25 +0 -0
  42. pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7 +0 -0
  43. pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8 +0 -0
  44. pembot/.git/objects/93/8f29d9b4b1ae86e39dddf9e3d115a82ddfc9b6 +0 -0
  45. pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456 +0 -0
  46. pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7 +0 -0
  47. pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880 +0 -0
  48. pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d +0 -0
  49. pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb +0 -0
  50. pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3 +0 -0
  51. pembot/.git/objects/bf/32a7e6872e5dc4025ee3df3c921ec7ade0855f +0 -0
  52. pembot/.git/objects/c0/793458db6e1bee7f79f1a504fb8ff4963f8ed3 +0 -0
  53. pembot/.git/objects/c2/443060c07101948487cfa93cc39e082e9e0f5f +0 -0
  54. pembot/.git/objects/e5/3070f2b07f45d031444b09b1b38658f3caf29e +0 -0
  55. pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc +0 -0
  56. pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d +0 -0
  57. pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58 +0 -0
  58. pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238 +0 -0
  59. pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552 +0 -0
  60. pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8 +0 -0
  61. pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300 +0 -0
  62. pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88 +0 -0
  63. pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6 +0 -0
  64. pembot/.git/objects/fb/6c90c9ce5e0cdfbe074a3f060afc66f62eefde +0 -0
  65. pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2 +0 -0
  66. pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx +0 -0
  67. pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack +0 -0
  68. pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev +0 -0
  69. pembot/.git/packed-refs +2 -0
  70. pembot/.git/refs/heads/main +1 -0
  71. pembot/.git/refs/remotes/origin/HEAD +1 -0
  72. pembot/.git/refs/remotes/origin/main +1 -0
  73. pembot/.gitignore +7 -0
  74. pembot/AnyToText/__init__.py +0 -0
  75. pembot/AnyToText/convertor.py +260 -0
  76. pembot/LICENSE +674 -0
  77. pembot/TextEmbedder/__init__.py +0 -0
  78. pembot/TextEmbedder/gemini_embedder.py +27 -0
  79. pembot/TextEmbedder/mongodb_embedder.py +258 -0
  80. pembot/TextEmbedder/mongodb_index_creator.py +133 -0
  81. pembot/TextEmbedder/vector_query.py +64 -0
  82. pembot/__init__.py +6 -0
  83. pembot/config/config.yaml +5 -0
  84. pembot/gartner.py +140 -0
  85. pembot/main.py +208 -0
  86. pembot/output_structure_local.py +63 -0
  87. pembot/pdf2markdown/.git/HEAD +1 -0
  88. pembot/pdf2markdown/.git/config +11 -0
  89. pembot/pdf2markdown/.git/description +1 -0
  90. pembot/pdf2markdown/.git/hooks/applypatch-msg.sample +15 -0
  91. pembot/pdf2markdown/.git/hooks/commit-msg.sample +24 -0
  92. pembot/pdf2markdown/.git/hooks/fsmonitor-watchman.sample +174 -0
  93. pembot/pdf2markdown/.git/hooks/post-update.sample +8 -0
  94. pembot/pdf2markdown/.git/hooks/pre-applypatch.sample +14 -0
  95. pembot/pdf2markdown/.git/hooks/pre-commit.sample +49 -0
  96. pembot/pdf2markdown/.git/hooks/pre-merge-commit.sample +13 -0
  97. pembot/pdf2markdown/.git/hooks/pre-push.sample +53 -0
  98. pembot/pdf2markdown/.git/hooks/pre-rebase.sample +169 -0
  99. pembot/pdf2markdown/.git/hooks/pre-receive.sample +24 -0
  100. pembot/pdf2markdown/.git/hooks/prepare-commit-msg.sample +42 -0
  101. pembot/pdf2markdown/.git/hooks/push-to-checkout.sample +78 -0
  102. pembot/pdf2markdown/.git/hooks/sendemail-validate.sample +77 -0
  103. pembot/pdf2markdown/.git/hooks/update.sample +128 -0
  104. pembot/pdf2markdown/.git/index +0 -0
  105. pembot/pdf2markdown/.git/info/exclude +6 -0
  106. pembot/pdf2markdown/.git/logs/HEAD +1 -0
  107. pembot/pdf2markdown/.git/logs/refs/heads/main +1 -0
  108. pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD +1 -0
  109. pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx +0 -0
  110. pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack +0 -0
  111. pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev +0 -0
  112. pembot/pdf2markdown/.git/packed-refs +2 -0
  113. pembot/pdf2markdown/.git/refs/heads/main +1 -0
  114. pembot/pdf2markdown/.git/refs/remotes/origin/HEAD +1 -0
  115. pembot/pdf2markdown/LICENSE +21 -0
  116. pembot/pdf2markdown/README.md +107 -0
  117. pembot/pdf2markdown/__init__.py +0 -0
  118. pembot/pdf2markdown/config/config.yaml +2 -0
  119. pembot/pdf2markdown/extract.py +888 -0
  120. pembot/pdf2markdown/requirements.txt +8 -0
  121. pembot/pem.py +157 -0
  122. pembot/query.py +204 -0
  123. pembot/utils/__init__.py +0 -0
  124. pembot/utils/inference_client.py +132 -0
  125. pembot/utils/string_tools.py +45 -0
  126. pembot-0.0.3.dist-info/METADATA +8 -0
  127. pembot-0.0.3.dist-info/RECORD +129 -0
  128. pembot-0.0.3.dist-info/WHEEL +5 -0
  129. pembot-0.0.3.dist-info/licenses/LICENSE +674 -0
@@ -0,0 +1,260 @@
1
+ from tempfile import TemporaryDirectory
2
+ import mimetypes
3
+ from pathlib import Path
4
+ from pembot.pdf2markdown.extract import MarkdownPDFExtractor
5
+ import os
6
+ import json
7
+ import pandas as pd
8
+ from typing import Literal, Union, Dict, Any, List
9
+ import tempfile
10
+
11
+
12
+ PandasReadEngineType = Literal['xlrd', 'openpyxl', 'odf', 'pyxlsb', 'calamine', None]
13
+
14
+ EXCEL_FILE_TYPES= [
15
+ 'text/csv',
16
+ 'application/vnd.ms-excel',
17
+ 'application/msexcel',
18
+ 'application/x-msexcel',
19
+ 'application/x-ms-excel',
20
+ 'application/x-excel',
21
+ 'application/x-dos_ms_excel',
22
+ 'application/x-dos_ms_excel',
23
+ 'application/xls',
24
+ 'application/x-xls',
25
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
26
+ 'application/vnd.oasis.opendocument.spreadsheet',
27
+ ]
28
+
29
+
30
+ class Convertor():
31
+
32
+
33
+ def __init__(self, myfile: Path | None, output_dir: Path | None, file_bytes: bytes | None, suffix: str | None, file_type: str | None):
34
+
35
+ self.output= ""
36
+
37
+ # file_type can be pdf, excel, etc.
38
+ if output_dir is None and file_bytes is not None and suffix is not None and myfile is None:
39
+ with tempfile.TemporaryDirectory() as dp:
40
+ with tempfile.NamedTemporaryFile(suffix= suffix, mode= 'wb') as fp:
41
+ fp.write(file_bytes)
42
+ myfile= Path(fp.name)
43
+ output_dir= Path(dp)
44
+ if file_type == 'pdf':
45
+ extractor= MarkdownPDFExtractor(str(myfile), output_path= str(self.output_dir), page_delimiter= "-- NEXT PAGE --")
46
+ extractor.extract()
47
+ with open(output_dir / (myfile.stem + '.md')) as output_file:
48
+ self.output= output_file.read()
49
+ elif file_type == 'excel':
50
+ self.input_filepath= myfile
51
+ self.json_filepath = output_dir / (myfile.stem + ".json")
52
+ self.convert_file_to_json()
53
+ with open(output_dir / (myfile.stem + '.json')) as output_file:
54
+ self.output= output_file.read()
55
+
56
+ elif output_dir is not None and myfile is not None:
57
+ print("got output path for conversion: ", output_dir)
58
+ mt= mimetypes.guess_file_type(str(myfile))[0]
59
+
60
+ self.output_dir= output_dir
61
+ self.input_filepath= myfile
62
+ base_name, _ = os.path.splitext(myfile.name)
63
+ self.json_filepath = output_dir / 'json' / (base_name + ".json")
64
+
65
+ if mt == 'application/json':
66
+ print("the file was json")
67
+ elif mt == 'application/pdf':
68
+ print("the file was pdf, outputting in: ", output_dir)
69
+ extractor= MarkdownPDFExtractor(str(myfile), output_path= str(self.output_dir), page_delimiter= "-- NEXT PAGE --")
70
+ extractor.extract()
71
+
72
+ elif mt in EXCEL_FILE_TYPES:
73
+ self.convert_file_to_json()
74
+
75
+ else:
76
+ print(mt)
77
+
78
+
79
+
80
+ def convert_file_to_json(
81
+ self,
82
+ sheet_to_convert: Union[str, int, None] = None, # Relevant for Excel/ODS
83
+ orient: Literal['dict', 'list', 'series', 'split', 'records', 'index'] = 'records', # Corrected type hint
84
+ date_format: Union[str, None] = 'iso', # 'iso', 'epoch', or None
85
+ csv_encoding: str = 'utf-8', # For reading CSV files
86
+ excel_ods_engine: PandasReadEngineType = None # For Excel/ODS, e.g., 'openpyxl', 'xlrd', 'odf'
87
+ ) -> bool:
88
+ """
89
+ Converts an Excel, ODS, or CSV file (or a specific Excel/ODS sheet)
90
+ into an equivalent JSON format.
91
+
92
+ Args:
93
+ sheet_to_convert (str | int | None, optional):
94
+ - For Excel/ODS:
95
+ - If None (default): Converts all sheets. The JSON output will be a
96
+ dictionary where keys are sheet names and values are the JSON
97
+ representation of each sheet.
98
+ - If str: Name of the specific sheet to convert.
99
+ - If int: Index of the specific sheet to convert (0-based).
100
+ If a specific sheet is requested, the JSON output will directly be
101
+ the representation of that sheet.
102
+ - For CSV: This parameter is ignored. The entire CSV is processed.
103
+ orient (str, optional): Pandas DataFrame.to_dict() orientation for each sheet/CSV.
104
+ Default: 'records'. See pandas.DataFrame.to_dict() documentation.
105
+ date_format (str | None, optional): Format for datetime objects.
106
+ - 'iso' (default): ISO8601 format (e.g., '2023-10-27T10:30:00').
107
+ - 'epoch': Milliseconds since epoch.
108
+ - None: Pandas default (often Timestamps). 'iso' is generally safer for JSON.
109
+ csv_encoding (str, optional): Encoding for reading CSV files. Default is 'utf-8'.
110
+ excel_ods_engine (str | None, optional): Pandas engine for reading Excel or ODS files.
111
+ - For Excel: 'openpyxl' (for .xlsx), 'xlrd' (for .xls).
112
+ - For ODS: 'odf' (requires 'odfpy' library).
113
+ If None, pandas auto-detects based on file extension and installed libraries.
114
+
115
+ Returns:
116
+ bool: True if conversion was successful, False otherwise.
117
+ """
118
+ input_filepath = self.input_filepath
119
+ json_filepath = self.json_filepath
120
+
121
+ try:
122
+
123
+ if not input_filepath.exists():
124
+ print(f"Error: Input file not found at {input_filepath}")
125
+ return False
126
+
127
+ # Ensure output directory exists
128
+ json_filepath.parent.mkdir(parents=True, exist_ok=True)
129
+
130
+ file_suffix = input_filepath.suffix.lower()
131
+ output_data_final: Union[Dict[str, Any], List[Dict[str, Any]]] = {}
132
+
133
+ dataframes_to_process: list[tuple[pd.DataFrame, str | None]] = []
134
+
135
+ current_engine: PandasReadEngineType = excel_ods_engine
136
+
137
+ if file_suffix == '.csv':
138
+ if sheet_to_convert is not None:
139
+ print(f"Info: 'sheet_to_convert' parameter ('{sheet_to_convert}') is ignored for CSV file '{input_filepath.name}'. Processing entire CSV.")
140
+ try:
141
+ df = pd.read_csv(input_filepath, encoding=csv_encoding)
142
+ dataframes_to_process.append((df, None))
143
+ except Exception as e:
144
+ print(f"Error reading CSV file '{input_filepath.name}': {e}")
145
+ return False
146
+
147
+ elif file_suffix in ['.xls', '.xlsx', '.ods']:
148
+ try:
149
+ if file_suffix == '.ods':
150
+ if current_engine is None:
151
+ current_engine = 'odf'
152
+ elif current_engine != 'odf':
153
+ print(f"Warning: Specified engine '{current_engine}' may not be optimal for ODS. Forcing 'odf'.")
154
+ current_engine = 'odf'
155
+
156
+ if sheet_to_convert is not None:
157
+ df = pd.read_excel(input_filepath, sheet_name=sheet_to_convert, engine=current_engine)
158
+ dataframes_to_process.append((df, None))
159
+
160
+ else:
161
+ excel_file = pd.ExcelFile(input_filepath, engine=current_engine)
162
+ if not excel_file.sheet_names:
163
+ print(f"Warning: File '{input_filepath.name}' contains no sheets.")
164
+ for sheet_name in excel_file.sheet_names:
165
+ df = excel_file.parse(sheet_name) # engine is inherited
166
+ dataframes_to_process.append((df, sheet_name))
167
+ except ImportError as ie:
168
+ if 'odfpy' in str(ie).lower() and file_suffix == '.ods':
169
+ print(f"Error reading ODS file '{input_filepath.name}': The 'odfpy' library is required. Please install it using 'pip install odfpy'.")
170
+ elif 'xlrd' in str(ie).lower() and file_suffix == '.xls':
171
+ print(f"Error reading .xls file '{input_filepath.name}': The 'xlrd' library might be required. Please install it using 'pip install xlrd'.")
172
+ elif 'openpyxl' in str(ie).lower() and file_suffix == '.xlsx':
173
+ print(f"Error reading .xlsx file '{input_filepath.name}': The 'openpyxl' library might be required. Please install it using 'pip install openpyxl'.")
174
+ else:
175
+ print(f"ImportError reading file '{input_filepath.name}': {ie}")
176
+ return False
177
+ except Exception as e:
178
+ print(f"Error reading Excel/ODS file '{input_filepath.name}': {e}")
179
+ return False
180
+ else:
181
+ print(f"Error: Unsupported file type: '{file_suffix}'. Please provide a CSV, XLS, XLSX, or ODS file.")
182
+ return False
183
+
184
+ if not dataframes_to_process and file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None:
185
+ print(f"Info: No dataframes were loaded from '{input_filepath.name}'. Output JSON will be empty if processing all sheets from an empty file.")
186
+ elif not dataframes_to_process and not (file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None):
187
+ pass
188
+
189
+
190
+ is_direct_output = len(dataframes_to_process) == 1 and dataframes_to_process[0][1] is None
191
+ temp_processed_data: Dict[str, Any] = {}
192
+
193
+ for df_original, name_key in dataframes_to_process:
194
+ df = df_original.copy()
195
+
196
+ if date_format:
197
+ for col_name in df.select_dtypes(include=['datetime64[ns]', 'datetime', 'datetimetz']).columns:
198
+ try:
199
+ if date_format == 'iso':
200
+ df[col_name] = df[col_name].apply(lambda x: x.isoformat() if pd.notnull(x) and hasattr(x, 'isoformat') else None)
201
+ elif date_format == 'epoch':
202
+ df[col_name] = df[col_name].apply(lambda x: int(x.timestamp() * 1000) if pd.notnull(x) and hasattr(x, 'timestamp') else None)
203
+ except Exception as e_date:
204
+ print(f"Warning: Could not fully convert date column '{col_name}' in '{name_key or input_filepath.name}' using format '{date_format}'. Error: {e_date}. Problematic values might be None.")
205
+
206
+ df = df.astype(object).where(pd.notnull(df), None)
207
+ current_json_segment = df.to_dict(orient=orient)
208
+
209
+ if is_direct_output:
210
+ output_data_final = current_json_segment
211
+ break
212
+ else:
213
+ if name_key is not None:
214
+ temp_processed_data[name_key] = current_json_segment
215
+
216
+ if not is_direct_output:
217
+ output_data_final = temp_processed_data
218
+
219
+ with open(json_filepath, 'w', encoding='utf-8') as f:
220
+ json.dump(output_data_final, f, indent=4, ensure_ascii=False)
221
+
222
+ print(f"Successfully converted '{input_filepath.name}' to '{json_filepath.name}'")
223
+ return True
224
+
225
+ except FileNotFoundError:
226
+ print(f"Error: Input file not found at {input_filepath.name}")
227
+ return False
228
+ except ValueError as ve:
229
+ print(f"ValueError during conversion of '{input_filepath.name}': {ve}")
230
+ return False
231
+ except Exception as e:
232
+ print(f"An unexpected error occurred during conversion of '{input_filepath.name}': {e}")
233
+ return False
234
+
235
+
236
+ def chunk_text(text, chunk_size=500, overlap_size=50):
237
+ """
238
+ Chunks a given text into smaller pieces with optional overlap.
239
+
240
+ Args:
241
+ text (str): The input text to be chunked.
242
+ chunk_size (int): The maximum size of each chunk (in characters).
243
+ overlap_size (int): The number of characters to overlap between consecutive chunks.
244
+
245
+ Returns:
246
+ list: A list of text chunks.
247
+ """
248
+ chunks = []
249
+ start = 0
250
+ while start < len(text):
251
+ end = start + chunk_size
252
+ chunk = text[start:end]
253
+ chunks.append(chunk)
254
+ start += (chunk_size - overlap_size)
255
+ if start < 0: # Handle cases where overlap_size is greater than chunk_size
256
+ start = 0
257
+ return chunks
258
+
259
+ if __name__ == '__main__':
260
+ print("do you want a rice bag?")