pembot 0.0.3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pembot might be problematic. Click here for more details.
- pembot/.git/COMMIT_EDITMSG +1 -0
- pembot/.git/HEAD +1 -0
- pembot/.git/config +11 -0
- pembot/.git/description +1 -0
- pembot/.git/hooks/applypatch-msg.sample +15 -0
- pembot/.git/hooks/commit-msg.sample +24 -0
- pembot/.git/hooks/fsmonitor-watchman.sample +174 -0
- pembot/.git/hooks/post-update.sample +8 -0
- pembot/.git/hooks/pre-applypatch.sample +14 -0
- pembot/.git/hooks/pre-commit.sample +49 -0
- pembot/.git/hooks/pre-merge-commit.sample +13 -0
- pembot/.git/hooks/pre-push.sample +53 -0
- pembot/.git/hooks/pre-rebase.sample +169 -0
- pembot/.git/hooks/pre-receive.sample +24 -0
- pembot/.git/hooks/prepare-commit-msg.sample +42 -0
- pembot/.git/hooks/push-to-checkout.sample +78 -0
- pembot/.git/hooks/sendemail-validate.sample +77 -0
- pembot/.git/hooks/update.sample +128 -0
- pembot/.git/index +0 -0
- pembot/.git/info/exclude +6 -0
- pembot/.git/logs/HEAD +6 -0
- pembot/.git/logs/refs/heads/main +6 -0
- pembot/.git/logs/refs/remotes/origin/HEAD +1 -0
- pembot/.git/logs/refs/remotes/origin/main +5 -0
- pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c +0 -0
- pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1 +0 -0
- pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63 +0 -0
- pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7 +0 -0
- pembot/.git/objects/28/db0ab48059acccd7d257aa02e52e9b6b83a4a5 +0 -0
- pembot/.git/objects/35/97e518a8658280be9f377f78edf1dfa1f23814 +0 -0
- pembot/.git/objects/3d/07d3b29ff53d95de3898fb786d61732f210515 +0 -0
- pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb +0 -0
- pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba +0 -0
- pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0 +0 -0
- pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904 +0 -0
- pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba +0 -0
- pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177 +0 -0
- pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9 +0 -0
- pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331 +0 -0
- pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5 +0 -0
- pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25 +0 -0
- pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7 +0 -0
- pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8 +0 -0
- pembot/.git/objects/93/8f29d9b4b1ae86e39dddf9e3d115a82ddfc9b6 +0 -0
- pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456 +0 -0
- pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7 +0 -0
- pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880 +0 -0
- pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d +0 -0
- pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb +0 -0
- pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3 +0 -0
- pembot/.git/objects/bf/32a7e6872e5dc4025ee3df3c921ec7ade0855f +0 -0
- pembot/.git/objects/c0/793458db6e1bee7f79f1a504fb8ff4963f8ed3 +0 -0
- pembot/.git/objects/c2/443060c07101948487cfa93cc39e082e9e0f5f +0 -0
- pembot/.git/objects/e5/3070f2b07f45d031444b09b1b38658f3caf29e +0 -0
- pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc +0 -0
- pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d +0 -0
- pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58 +0 -0
- pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238 +0 -0
- pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552 +0 -0
- pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8 +0 -0
- pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300 +0 -0
- pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88 +0 -0
- pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6 +0 -0
- pembot/.git/objects/fb/6c90c9ce5e0cdfbe074a3f060afc66f62eefde +0 -0
- pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2 +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev +0 -0
- pembot/.git/packed-refs +2 -0
- pembot/.git/refs/heads/main +1 -0
- pembot/.git/refs/remotes/origin/HEAD +1 -0
- pembot/.git/refs/remotes/origin/main +1 -0
- pembot/.gitignore +7 -0
- pembot/AnyToText/__init__.py +0 -0
- pembot/AnyToText/convertor.py +260 -0
- pembot/LICENSE +674 -0
- pembot/TextEmbedder/__init__.py +0 -0
- pembot/TextEmbedder/gemini_embedder.py +27 -0
- pembot/TextEmbedder/mongodb_embedder.py +258 -0
- pembot/TextEmbedder/mongodb_index_creator.py +133 -0
- pembot/TextEmbedder/vector_query.py +64 -0
- pembot/__init__.py +6 -0
- pembot/config/config.yaml +5 -0
- pembot/gartner.py +140 -0
- pembot/main.py +208 -0
- pembot/output_structure_local.py +63 -0
- pembot/pdf2markdown/.git/HEAD +1 -0
- pembot/pdf2markdown/.git/config +11 -0
- pembot/pdf2markdown/.git/description +1 -0
- pembot/pdf2markdown/.git/hooks/applypatch-msg.sample +15 -0
- pembot/pdf2markdown/.git/hooks/commit-msg.sample +24 -0
- pembot/pdf2markdown/.git/hooks/fsmonitor-watchman.sample +174 -0
- pembot/pdf2markdown/.git/hooks/post-update.sample +8 -0
- pembot/pdf2markdown/.git/hooks/pre-applypatch.sample +14 -0
- pembot/pdf2markdown/.git/hooks/pre-commit.sample +49 -0
- pembot/pdf2markdown/.git/hooks/pre-merge-commit.sample +13 -0
- pembot/pdf2markdown/.git/hooks/pre-push.sample +53 -0
- pembot/pdf2markdown/.git/hooks/pre-rebase.sample +169 -0
- pembot/pdf2markdown/.git/hooks/pre-receive.sample +24 -0
- pembot/pdf2markdown/.git/hooks/prepare-commit-msg.sample +42 -0
- pembot/pdf2markdown/.git/hooks/push-to-checkout.sample +78 -0
- pembot/pdf2markdown/.git/hooks/sendemail-validate.sample +77 -0
- pembot/pdf2markdown/.git/hooks/update.sample +128 -0
- pembot/pdf2markdown/.git/index +0 -0
- pembot/pdf2markdown/.git/info/exclude +6 -0
- pembot/pdf2markdown/.git/logs/HEAD +1 -0
- pembot/pdf2markdown/.git/logs/refs/heads/main +1 -0
- pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD +1 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx +0 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack +0 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev +0 -0
- pembot/pdf2markdown/.git/packed-refs +2 -0
- pembot/pdf2markdown/.git/refs/heads/main +1 -0
- pembot/pdf2markdown/.git/refs/remotes/origin/HEAD +1 -0
- pembot/pdf2markdown/LICENSE +21 -0
- pembot/pdf2markdown/README.md +107 -0
- pembot/pdf2markdown/__init__.py +0 -0
- pembot/pdf2markdown/config/config.yaml +2 -0
- pembot/pdf2markdown/extract.py +888 -0
- pembot/pdf2markdown/requirements.txt +8 -0
- pembot/pem.py +157 -0
- pembot/query.py +204 -0
- pembot/utils/__init__.py +0 -0
- pembot/utils/inference_client.py +132 -0
- pembot/utils/string_tools.py +45 -0
- pembot-0.0.3.dist-info/METADATA +8 -0
- pembot-0.0.3.dist-info/RECORD +129 -0
- pembot-0.0.3.dist-info/WHEEL +5 -0
- pembot-0.0.3.dist-info/licenses/LICENSE +674 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
from tempfile import TemporaryDirectory
|
|
2
|
+
import mimetypes
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from pembot.pdf2markdown.extract import MarkdownPDFExtractor
|
|
5
|
+
import os
|
|
6
|
+
import json
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from typing import Literal, Union, Dict, Any, List
|
|
9
|
+
import tempfile
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Engine names accepted for reading Excel/ODS files with pandas.
# None means "let pandas auto-detect the engine".
PandasReadEngineType = Literal['xlrd', 'openpyxl', 'odf', 'pyxlsb', 'calamine', None]

# MIME types that are routed to the spreadsheet -> JSON conversion.
# Each entry appears exactly once (the list is only used for membership tests).
EXCEL_FILE_TYPES = [
    'text/csv',
    'application/vnd.ms-excel',
    'application/msexcel',
    'application/x-msexcel',
    'application/x-ms-excel',
    'application/x-excel',
    'application/x-dos_ms_excel',
    'application/xls',
    'application/x-xls',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'application/vnd.oasis.opendocument.spreadsheet',
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Convertor():
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def __init__(self, myfile: Path | None, output_dir: Path | None, file_bytes: bytes | None, suffix: str | None, file_type: str | None):
|
|
34
|
+
|
|
35
|
+
self.output= ""
|
|
36
|
+
|
|
37
|
+
# file_type can be pdf, excel, etc.
|
|
38
|
+
if output_dir is None and file_bytes is not None and suffix is not None and myfile is None:
|
|
39
|
+
with tempfile.TemporaryDirectory() as dp:
|
|
40
|
+
with tempfile.NamedTemporaryFile(suffix= suffix, mode= 'wb') as fp:
|
|
41
|
+
fp.write(file_bytes)
|
|
42
|
+
myfile= Path(fp.name)
|
|
43
|
+
output_dir= Path(dp)
|
|
44
|
+
if file_type == 'pdf':
|
|
45
|
+
extractor= MarkdownPDFExtractor(str(myfile), output_path= str(self.output_dir), page_delimiter= "-- NEXT PAGE --")
|
|
46
|
+
extractor.extract()
|
|
47
|
+
with open(output_dir / (myfile.stem + '.md')) as output_file:
|
|
48
|
+
self.output= output_file.read()
|
|
49
|
+
elif file_type == 'excel':
|
|
50
|
+
self.input_filepath= myfile
|
|
51
|
+
self.json_filepath = output_dir / (myfile.stem + ".json")
|
|
52
|
+
self.convert_file_to_json()
|
|
53
|
+
with open(output_dir / (myfile.stem + '.json')) as output_file:
|
|
54
|
+
self.output= output_file.read()
|
|
55
|
+
|
|
56
|
+
elif output_dir is not None and myfile is not None:
|
|
57
|
+
print("got output path for conversion: ", output_dir)
|
|
58
|
+
mt= mimetypes.guess_file_type(str(myfile))[0]
|
|
59
|
+
|
|
60
|
+
self.output_dir= output_dir
|
|
61
|
+
self.input_filepath= myfile
|
|
62
|
+
base_name, _ = os.path.splitext(myfile.name)
|
|
63
|
+
self.json_filepath = output_dir / 'json' / (base_name + ".json")
|
|
64
|
+
|
|
65
|
+
if mt == 'application/json':
|
|
66
|
+
print("the file was json")
|
|
67
|
+
elif mt == 'application/pdf':
|
|
68
|
+
print("the file was pdf, outputting in: ", output_dir)
|
|
69
|
+
extractor= MarkdownPDFExtractor(str(myfile), output_path= str(self.output_dir), page_delimiter= "-- NEXT PAGE --")
|
|
70
|
+
extractor.extract()
|
|
71
|
+
|
|
72
|
+
elif mt in EXCEL_FILE_TYPES:
|
|
73
|
+
self.convert_file_to_json()
|
|
74
|
+
|
|
75
|
+
else:
|
|
76
|
+
print(mt)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def convert_file_to_json(
|
|
81
|
+
self,
|
|
82
|
+
sheet_to_convert: Union[str, int, None] = None, # Relevant for Excel/ODS
|
|
83
|
+
orient: Literal['dict', 'list', 'series', 'split', 'records', 'index'] = 'records', # Corrected type hint
|
|
84
|
+
date_format: Union[str, None] = 'iso', # 'iso', 'epoch', or None
|
|
85
|
+
csv_encoding: str = 'utf-8', # For reading CSV files
|
|
86
|
+
excel_ods_engine: PandasReadEngineType = None # For Excel/ODS, e.g., 'openpyxl', 'xlrd', 'odf'
|
|
87
|
+
) -> bool:
|
|
88
|
+
"""
|
|
89
|
+
Converts an Excel, ODS, or CSV file (or a specific Excel/ODS sheet)
|
|
90
|
+
into an equivalent JSON format.
|
|
91
|
+
|
|
92
|
+
Args:
|
|
93
|
+
sheet_to_convert (str | int | None, optional):
|
|
94
|
+
- For Excel/ODS:
|
|
95
|
+
- If None (default): Converts all sheets. The JSON output will be a
|
|
96
|
+
dictionary where keys are sheet names and values are the JSON
|
|
97
|
+
representation of each sheet.
|
|
98
|
+
- If str: Name of the specific sheet to convert.
|
|
99
|
+
- If int: Index of the specific sheet to convert (0-based).
|
|
100
|
+
If a specific sheet is requested, the JSON output will directly be
|
|
101
|
+
the representation of that sheet.
|
|
102
|
+
- For CSV: This parameter is ignored. The entire CSV is processed.
|
|
103
|
+
orient (str, optional): Pandas DataFrame.to_dict() orientation for each sheet/CSV.
|
|
104
|
+
Default: 'records'. See pandas.DataFrame.to_dict() documentation.
|
|
105
|
+
date_format (str | None, optional): Format for datetime objects.
|
|
106
|
+
- 'iso' (default): ISO8601 format (e.g., '2023-10-27T10:30:00').
|
|
107
|
+
- 'epoch': Milliseconds since epoch.
|
|
108
|
+
- None: Pandas default (often Timestamps). 'iso' is generally safer for JSON.
|
|
109
|
+
csv_encoding (str, optional): Encoding for reading CSV files. Default is 'utf-8'.
|
|
110
|
+
excel_ods_engine (str | None, optional): Pandas engine for reading Excel or ODS files.
|
|
111
|
+
- For Excel: 'openpyxl' (for .xlsx), 'xlrd' (for .xls).
|
|
112
|
+
- For ODS: 'odf' (requires 'odfpy' library).
|
|
113
|
+
If None, pandas auto-detects based on file extension and installed libraries.
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
bool: True if conversion was successful, False otherwise.
|
|
117
|
+
"""
|
|
118
|
+
input_filepath = self.input_filepath
|
|
119
|
+
json_filepath = self.json_filepath
|
|
120
|
+
|
|
121
|
+
try:
|
|
122
|
+
|
|
123
|
+
if not input_filepath.exists():
|
|
124
|
+
print(f"Error: Input file not found at {input_filepath}")
|
|
125
|
+
return False
|
|
126
|
+
|
|
127
|
+
# Ensure output directory exists
|
|
128
|
+
json_filepath.parent.mkdir(parents=True, exist_ok=True)
|
|
129
|
+
|
|
130
|
+
file_suffix = input_filepath.suffix.lower()
|
|
131
|
+
output_data_final: Union[Dict[str, Any], List[Dict[str, Any]]] = {}
|
|
132
|
+
|
|
133
|
+
dataframes_to_process: list[tuple[pd.DataFrame, str | None]] = []
|
|
134
|
+
|
|
135
|
+
current_engine: PandasReadEngineType = excel_ods_engine
|
|
136
|
+
|
|
137
|
+
if file_suffix == '.csv':
|
|
138
|
+
if sheet_to_convert is not None:
|
|
139
|
+
print(f"Info: 'sheet_to_convert' parameter ('{sheet_to_convert}') is ignored for CSV file '{input_filepath.name}'. Processing entire CSV.")
|
|
140
|
+
try:
|
|
141
|
+
df = pd.read_csv(input_filepath, encoding=csv_encoding)
|
|
142
|
+
dataframes_to_process.append((df, None))
|
|
143
|
+
except Exception as e:
|
|
144
|
+
print(f"Error reading CSV file '{input_filepath.name}': {e}")
|
|
145
|
+
return False
|
|
146
|
+
|
|
147
|
+
elif file_suffix in ['.xls', '.xlsx', '.ods']:
|
|
148
|
+
try:
|
|
149
|
+
if file_suffix == '.ods':
|
|
150
|
+
if current_engine is None:
|
|
151
|
+
current_engine = 'odf'
|
|
152
|
+
elif current_engine != 'odf':
|
|
153
|
+
print(f"Warning: Specified engine '{current_engine}' may not be optimal for ODS. Forcing 'odf'.")
|
|
154
|
+
current_engine = 'odf'
|
|
155
|
+
|
|
156
|
+
if sheet_to_convert is not None:
|
|
157
|
+
df = pd.read_excel(input_filepath, sheet_name=sheet_to_convert, engine=current_engine)
|
|
158
|
+
dataframes_to_process.append((df, None))
|
|
159
|
+
|
|
160
|
+
else:
|
|
161
|
+
excel_file = pd.ExcelFile(input_filepath, engine=current_engine)
|
|
162
|
+
if not excel_file.sheet_names:
|
|
163
|
+
print(f"Warning: File '{input_filepath.name}' contains no sheets.")
|
|
164
|
+
for sheet_name in excel_file.sheet_names:
|
|
165
|
+
df = excel_file.parse(sheet_name) # engine is inherited
|
|
166
|
+
dataframes_to_process.append((df, sheet_name))
|
|
167
|
+
except ImportError as ie:
|
|
168
|
+
if 'odfpy' in str(ie).lower() and file_suffix == '.ods':
|
|
169
|
+
print(f"Error reading ODS file '{input_filepath.name}': The 'odfpy' library is required. Please install it using 'pip install odfpy'.")
|
|
170
|
+
elif 'xlrd' in str(ie).lower() and file_suffix == '.xls':
|
|
171
|
+
print(f"Error reading .xls file '{input_filepath.name}': The 'xlrd' library might be required. Please install it using 'pip install xlrd'.")
|
|
172
|
+
elif 'openpyxl' in str(ie).lower() and file_suffix == '.xlsx':
|
|
173
|
+
print(f"Error reading .xlsx file '{input_filepath.name}': The 'openpyxl' library might be required. Please install it using 'pip install openpyxl'.")
|
|
174
|
+
else:
|
|
175
|
+
print(f"ImportError reading file '{input_filepath.name}': {ie}")
|
|
176
|
+
return False
|
|
177
|
+
except Exception as e:
|
|
178
|
+
print(f"Error reading Excel/ODS file '{input_filepath.name}': {e}")
|
|
179
|
+
return False
|
|
180
|
+
else:
|
|
181
|
+
print(f"Error: Unsupported file type: '{file_suffix}'. Please provide a CSV, XLS, XLSX, or ODS file.")
|
|
182
|
+
return False
|
|
183
|
+
|
|
184
|
+
if not dataframes_to_process and file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None:
|
|
185
|
+
print(f"Info: No dataframes were loaded from '{input_filepath.name}'. Output JSON will be empty if processing all sheets from an empty file.")
|
|
186
|
+
elif not dataframes_to_process and not (file_suffix in ['.xls', '.xlsx', '.ods'] and sheet_to_convert is None):
|
|
187
|
+
pass
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
is_direct_output = len(dataframes_to_process) == 1 and dataframes_to_process[0][1] is None
|
|
191
|
+
temp_processed_data: Dict[str, Any] = {}
|
|
192
|
+
|
|
193
|
+
for df_original, name_key in dataframes_to_process:
|
|
194
|
+
df = df_original.copy()
|
|
195
|
+
|
|
196
|
+
if date_format:
|
|
197
|
+
for col_name in df.select_dtypes(include=['datetime64[ns]', 'datetime', 'datetimetz']).columns:
|
|
198
|
+
try:
|
|
199
|
+
if date_format == 'iso':
|
|
200
|
+
df[col_name] = df[col_name].apply(lambda x: x.isoformat() if pd.notnull(x) and hasattr(x, 'isoformat') else None)
|
|
201
|
+
elif date_format == 'epoch':
|
|
202
|
+
df[col_name] = df[col_name].apply(lambda x: int(x.timestamp() * 1000) if pd.notnull(x) and hasattr(x, 'timestamp') else None)
|
|
203
|
+
except Exception as e_date:
|
|
204
|
+
print(f"Warning: Could not fully convert date column '{col_name}' in '{name_key or input_filepath.name}' using format '{date_format}'. Error: {e_date}. Problematic values might be None.")
|
|
205
|
+
|
|
206
|
+
df = df.astype(object).where(pd.notnull(df), None)
|
|
207
|
+
current_json_segment = df.to_dict(orient=orient)
|
|
208
|
+
|
|
209
|
+
if is_direct_output:
|
|
210
|
+
output_data_final = current_json_segment
|
|
211
|
+
break
|
|
212
|
+
else:
|
|
213
|
+
if name_key is not None:
|
|
214
|
+
temp_processed_data[name_key] = current_json_segment
|
|
215
|
+
|
|
216
|
+
if not is_direct_output:
|
|
217
|
+
output_data_final = temp_processed_data
|
|
218
|
+
|
|
219
|
+
with open(json_filepath, 'w', encoding='utf-8') as f:
|
|
220
|
+
json.dump(output_data_final, f, indent=4, ensure_ascii=False)
|
|
221
|
+
|
|
222
|
+
print(f"Successfully converted '{input_filepath.name}' to '{json_filepath.name}'")
|
|
223
|
+
return True
|
|
224
|
+
|
|
225
|
+
except FileNotFoundError:
|
|
226
|
+
print(f"Error: Input file not found at {input_filepath.name}")
|
|
227
|
+
return False
|
|
228
|
+
except ValueError as ve:
|
|
229
|
+
print(f"ValueError during conversion of '{input_filepath.name}': {ve}")
|
|
230
|
+
return False
|
|
231
|
+
except Exception as e:
|
|
232
|
+
print(f"An unexpected error occurred during conversion of '{input_filepath.name}': {e}")
|
|
233
|
+
return False
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def chunk_text(text, chunk_size=500, overlap_size=50):
    """
    Chunks a given text into smaller pieces with optional overlap.

    Consecutive chunks start ``chunk_size - overlap_size`` characters apart.
    If ``overlap_size`` is greater than or equal to ``chunk_size`` the start
    still advances by one character per chunk, so the function always
    terminates.

    Args:
        text (str): The input text to be chunked.
        chunk_size (int): The maximum size of each chunk (in characters).
            Must be at least 1.
        overlap_size (int): The number of characters to overlap between consecutive chunks.

    Returns:
        list: A list of text chunks (empty for empty input).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Guarantee forward progress even when the overlap is as large as
    # (or larger than) the chunk itself.
    step = max(1, chunk_size - overlap_size)
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
|
|
258
|
+
|
|
259
|
+
# Running this module directly only prints a placeholder message;
# no conversion is performed.
if "__main__" == __name__:
    print("do you want a rice bag?")
|