hjxdl 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- hdl/_version.py +2 -2
- hdl/datasets/city_code.json +2576 -0
- hdl/datasets/defined_BaseFeatures.fdef +236 -0
- hdl/datasets/las.tsv +0 -0
- hdl/datasets/route_template.json +113 -0
- hdl/datasets/vocab.txt +591 -0
- hdl/ju/__init__.py +0 -0
- hdl/ju/setup.py +55 -0
- hdl/jupyfuncs/__init__.py +0 -0
- hdl/jupyfuncs/chem/__init__.py +0 -0
- hdl/jupyfuncs/chem/mol.py +548 -0
- hdl/jupyfuncs/chem/norm.py +268 -0
- hdl/jupyfuncs/chem/pdb_ext.py +94 -0
- hdl/jupyfuncs/chem/scaffold.py +25 -0
- hdl/jupyfuncs/chem/shape.py +241 -0
- hdl/jupyfuncs/chem/tokenizers.py +2 -0
- hdl/jupyfuncs/dbtools/__init__.py +0 -0
- hdl/jupyfuncs/dbtools/pg.py +42 -0
- hdl/jupyfuncs/dbtools/query_info.py +150 -0
- hdl/jupyfuncs/dl/__init__.py +0 -0
- hdl/jupyfuncs/dl/cp.py +54 -0
- hdl/jupyfuncs/dl/dataframe.py +38 -0
- hdl/jupyfuncs/dl/fp.py +49 -0
- hdl/jupyfuncs/dl/list.py +20 -0
- hdl/jupyfuncs/dl/model_utils.py +97 -0
- hdl/jupyfuncs/dl/tensor.py +159 -0
- hdl/jupyfuncs/dl/uncs.py +112 -0
- hdl/jupyfuncs/llm/__init__.py +0 -0
- hdl/jupyfuncs/llm/extract.py +123 -0
- hdl/jupyfuncs/llm/openapi.py +94 -0
- hdl/jupyfuncs/network/__init__.py +0 -0
- hdl/jupyfuncs/network/proxy.py +20 -0
- hdl/jupyfuncs/path/__init__.py +0 -0
- hdl/jupyfuncs/path/glob.py +285 -0
- hdl/jupyfuncs/path/strings.py +65 -0
- hdl/jupyfuncs/show/__init__.py +0 -0
- hdl/jupyfuncs/show/pbar.py +50 -0
- hdl/jupyfuncs/show/plot.py +259 -0
- hdl/jupyfuncs/utils/__init__.py +0 -0
- hdl/jupyfuncs/utils/wrappers.py +8 -0
- hdl/utils/llm/chat.py +4 -0
- {hjxdl-0.1.12.dist-info → hjxdl-0.1.14.dist-info}/METADATA +1 -1
- {hjxdl-0.1.12.dist-info → hjxdl-0.1.14.dist-info}/RECORD +45 -6
- {hjxdl-0.1.12.dist-info → hjxdl-0.1.14.dist-info}/WHEEL +1 -1
- {hjxdl-0.1.12.dist-info → hjxdl-0.1.14.dist-info}/top_level.txt +0 -0
hdl/jupyfuncs/llm/extract.py
@@ -0,0 +1,123 @@
+import pdfplumber
+import pytesseract
+from PIL import Image
+import pandas as pd
+import io
+from spire.doc import Document
+from spire.doc.common import *
+# from ..path.glob import (
+#     get_current_dir,
+#     get_files
+# )
+
+
+class DocExtractor:
+    def __init__(
+        self,
+        doc_files: list,
+        lang: str = "chi_sim"
+    ) -> None:
+        self.doc_files = doc_files
+        self.lang = lang
+
+    @staticmethod
+    def text_from_doc(
+        doc_path
+    ):
+        # Load a Word document and return its full text.
+        document = Document()
+        document.LoadFromFile(doc_path)
+        document_text = document.GetText()
+        return document_text
+
+    @staticmethod
+    def text_from_plain(
+        txt_path
+    ):
+        with open(txt_path, "r") as f:
+            text = f.read()
+        return text
+
+    @staticmethod
+    def extract_text_from_image(
+        image: Image.Image,
+        lang: str = "chi_sim",
+    ) -> str:
+        return pytesseract.image_to_string(image, lang=lang)
+
+    @staticmethod
+    def is_within_bbox(
+        bbox1, bbox2
+    ):
+        """Check if bbox1 is within bbox2."""
+        return (bbox1[0] >= bbox2[0] and bbox1[1] >= bbox2[1]
+                and bbox1[2] <= bbox2[2] and bbox1[3] <= bbox2[3])
+
+    def text_tables_from_pdf(
+        self,
+        pdf_path,
+        table_from_pic: bool = False
+    ):
+        all_tables = []
+        all_texts = []
+        with pdfplumber.open(pdf_path) as pdf:
+            for page_number, page in enumerate(pdf.pages):
+                tables = page.find_tables()
+                page_text = page.extract_text(x_tolerance=0.1, y_tolerance=0.1) or ''
+                page_text_lines = page_text.split('\n')
+
+                # Extract tables
+                if tables:
+                    for table in tables:
+                        table_data = table.extract()
+                        if len(table_data) > 1:
+                            df = pd.DataFrame(table_data[1:], columns=table_data[0])
+                            df['Page'] = page_number + 1  # record the source page number
+                            all_tables.append(df)
+
+                # Get bounding boxes for tables
+                table_bboxes = [table.bbox for table in tables]
+
+                # Filter out text within table bounding boxes
+                non_table_text = []
+                for char in page.chars:
+                    char_bbox = (char['x0'], char['top'], char['x1'], char['bottom'])
+                    if not any(self.is_within_bbox(char_bbox, table_bbox) for table_bbox in table_bboxes):
+                        non_table_text.append(char['text'])
+                remaining_text = ''.join(non_table_text).strip()
+                if remaining_text:
+                    all_texts.append(remaining_text)
+
+                # Extract tables from images if specified
+                if table_from_pic:
+                    for img in page.images:
+                        try:
+                            x0, top, x1, bottom = img["x0"], img["top"], img["x1"], img["bottom"]
+                            if x0 < 0 or top < 0 or x1 > page.width or bottom > page.height:
+                                print(f"Skipping image with invalid bounds on page {page_number + 1}")
+                                continue
+
+                            cropped_image = page.within_bbox((x0, top, x1, bottom)).to_image()
+                            img_bytes = io.BytesIO()
+                            cropped_image.save(img_bytes, format='PNG')
+                            img_bytes.seek(0)
+                            pil_image = Image.open(img_bytes)
+
+                            ocr_text = self.extract_text_from_image(pil_image, lang=self.lang)
+
+                            # Turn OCR lines into a rectangular table, padding short rows.
+                            table = [line.split() for line in ocr_text.split('\n') if line.strip()]
+                            if table:
+                                num_columns = max(len(row) for row in table)
+                                for row in table:
+                                    if len(row) != num_columns:
+                                        row.extend([''] * (num_columns - len(row)))
+
+                                df = pd.DataFrame(table[1:], columns=table[0])
+                                df['Page'] = page_number + 1
+                                all_tables.append(df)
+                        except Exception as e:
+                            print(f"Error processing image on page {page_number + 1}: {e}")
+
+        if all_tables:
+            return all_texts, all_tables
+        else:
+            return all_texts, [pd.DataFrame()]
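For reference, a minimal usage sketch of the new extractor (the file path is illustrative; OCR of embedded images assumes a local Tesseract install with the chi_sim language pack):

from hdl.jupyfuncs.llm.extract import DocExtractor

extractor = DocExtractor(doc_files=["report.pdf"], lang="chi_sim")  # hypothetical path
texts, tables = extractor.text_tables_from_pdf("report.pdf", table_from_pic=True)
print(tables[0].head())   # each recovered table is a DataFrame with a 'Page' column
print(texts[:1])          # non-table text, one string per page that had any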
hdl/jupyfuncs/llm/openapi.py
@@ -0,0 +1,94 @@
+from openai import OpenAI
+# import traceback
+
+
+def chat_oai_stream(
+    base_url="http://127.0.0.1:8000/v1",
+    api_key="dummy_key",
+    model="/data/models/Qwen-7B-Chat-Int4",
+    prompt="Who are you?",
+    *args,
+    **kwargs
+):
+    """Stream a chat completion from an OpenAI-compatible server.
+
+    Args:
+        base_url (str): The base URL of the OpenAI-compatible API. Default is "http://127.0.0.1:8000/v1".
+        api_key (str): The API key for accessing the API. Default is "dummy_key".
+        model (str): The model ID to use for the chat. Default is "/data/models/Qwen-7B-Chat-Int4".
+        prompt (str): The initial prompt for the chat conversation.
+
+    Yields:
+        str: The generated content from the chat conversation.
+    """
+    client = OpenAI(
+        base_url=base_url,
+        api_key=api_key,
+    )
+    response = client.chat.completions.create(
+        model=model,
+        messages=[{
+            "role": "user",
+            "content": prompt
+        }],
+        stream=True,
+        *args,
+        **kwargs
+    )
+
+    for chunk in response:
+        content = chunk.choices[0].delta.content
+        yield content
+
+
+def chat_oai_invoke(
+    base_url="http://127.0.0.1:8000/v1",
+    api_key="dummy_key",
+    model="/data/models/Qwen-7B-Chat-Int4",
+    prompt="Who are you?",
+    *args,
+    **kwargs
+):
+    """Invoke an OpenAI-compatible chat API and return the complete response.
+
+    Args:
+        base_url (str): The base URL of the OpenAI-compatible API. Default is "http://127.0.0.1:8000/v1".
+        api_key (str): The API key for accessing the API. Default is "dummy_key".
+        model (str): The model to use for generating the response. Default is "/data/models/Qwen-7B-Chat-Int4".
+        prompt (str): The prompt message to start the conversation. Default is "Who are you?".
+
+    Returns:
+        str: The response generated by the chat API based on the prompt.
+    """
+    client = OpenAI(
+        base_url=base_url,
+        api_key=api_key,
+    )
+    response = client.chat.completions.create(
+        model=model,
+        messages=[{
+            "role": "user",
+            "content": prompt
+        }],
+        stream=False,
+        *args,
+        **kwargs
+    )
+
+    return response.choices[0].message.content
+
+
+# client = OpenAI(
+#     base_url="http://127.0.0.1:8000/v1",
+#     api_key="dummy_key"  # a dummy API key
+# )
+
+
+# response = client.chat.completions.create(
+#     model="/data/models/Qwen-7B-Chat-Int4",
+#     messages=[{"role": "user", "content": "Tell me a ghost story"}],
+#     stream=True
+# )
+# for chunk in response:
+#     content = chunk.choices[0].delta.content
+#     if content:
+#         print(content, end='', flush=True)
+# print()
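For context, a minimal sketch of how the two helpers are consumed (it assumes an OpenAI-compatible server is already serving the default model at the default base_url):

from hdl.jupyfuncs.llm.openapi import chat_oai_stream, chat_oai_invoke

# Streaming: iterate over the generator; deltas can be None, so guard the print.
for content in chat_oai_stream(prompt="Tell me a story"):
    if content:
        print(content, end='', flush=True)
print()

# Blocking: one call, one complete reply string.
print(chat_oai_invoke(prompt="Who are you?"))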
hdl/jupyfuncs/network/__init__.py
File without changes
hdl/jupyfuncs/network/proxy.py
@@ -0,0 +1,20 @@
+import requests
+
+
+def get_proxies(
+    pool_server='http://172.20.0.9:5010',
+    https=False
+):
+    """Fetch a proxy from the pool server and build a requests-style proxy dict."""
+    http_proxy = requests.get(f"{pool_server}/get/").json().get("proxy")
+    https_proxy = requests.get(f"{pool_server}/get/?type=https").json().get("proxy")
+    if https:
+        proxy_dict = {
+            'http': f'http://{https_proxy}',
+            'https': f'https://{https_proxy}'
+        }
+    else:
+        proxy_dict = {
+            'http': f'http://{http_proxy}',
+            'https': f'https://{http_proxy}'
+        }
+    return proxy_dict
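A short usage sketch (assumes the proxy pool at the default pool_server address is reachable and returns entries like "1.2.3.4:8080"):

import requests
from hdl.jupyfuncs.network.proxy import get_proxies

proxies = get_proxies(https=False)
# e.g. {'http': 'http://1.2.3.4:8080', 'https': 'https://1.2.3.4:8080'}
resp = requests.get("http://example.com", proxies=proxies, timeout=10)
print(resp.status_code)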
hdl/jupyfuncs/path/__init__.py
File without changes
hdl/jupyfuncs/path/glob.py
@@ -0,0 +1,285 @@
+import os
+import typing as t
+import inspect
+import fnmatch
+import linecache
+import time
+import gc
+import psutil
+from os import path as osp
+import pathlib
+import sys
+import importlib
+import subprocess
+import re
+
+import multiprocess as mp
+
+import importlib.resources as pkg_resources
+import json
+
+
+def in_jupyter():
+    """Check if the code is running in a Jupyter notebook.
+
+    Returns:
+        bool: True if running in Jupyter notebook, False otherwise.
+    """
+    return 'ipykernel_launcher.py' in sys.argv[0]
+
+
+def in_docker():
+    """Check if the code is running inside a Docker container.
+
+    Returns:
+        bool: True if running inside a Docker container, False otherwise.
+    """
+    return osp.exists('/.dockerenv')
+
+
+def get_files(
+    dir_path,
+    file_types: list = ["txt"]
+):
+    """Get a list of files with specific file extensions in the given directory path.
+
+    Args:
+        dir_path (str): The path to the target directory.
+        file_types (list): File extensions to match, e.g. ["md", "pdf", "txt"].
+
+    Returns:
+        list: Absolute paths of the files whose names end with one of the given extensions.
+    """
+    file_list = []
+    # os.walk traverses the target directory recursively.
+    for filepath, dirnames, filenames in os.walk(dir_path):
+        # Skip hidden files and directories.
+        filenames = [f for f in filenames if not f[0] == '.']
+        dirnames[:] = [d for d in dirnames if not d[0] == '.']
+        for filename in filenames:
+            # str.endswith requires a tuple, not a list.
+            if filename.endswith(tuple(file_types)):
+                file_list.append(os.path.join(filepath, filename))
+    return file_list
+
+
+def get_dataset_file(filename):
+    """Load a JSON dataset file shipped with the package.
+
+    Args:
+        filename (str): The name of the dataset file.
+
+    Returns:
+        dict: The data loaded from the dataset file.
+    """
+    with pkg_resources.path('jupyfuncs.datasets', filename) as file_path:
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+    return data
+
+
+def recursive_glob(treeroot, pattern):
+    """Recursively search for files matching a pattern, starting from the given directory.
+
+    Args:
+        treeroot (str): The root directory to start the search from.
+        pattern (str): The pattern to match the files against.
+
+    Returns:
+        list: A list of file paths that match the specified pattern.
+    """
+    results = []
+    for base, dirs, files in os.walk(treeroot):
+        goodfiles = fnmatch.filter(files, pattern)
+        results.extend(os.path.join(base, f) for f in goodfiles)
+    return results
+
+
+def makedirs(path: str, isfile: bool = False) -> None:
+    """Create a directory given a path to either a directory or file.
+
+    If a directory is provided, creates that directory. If a file is provided
+    (i.e. :code:`isfile == True`), creates the parent directory for that file.
+
+    Args:
+        path (str): Path to a directory or file.
+        isfile (bool, optional): Whether the provided path is a file. Defaults to False.
+    """
+    if isfile:
+        path = os.path.dirname(path)
+    if path != '':
+        os.makedirs(path, exist_ok=True)
+
+
+def get_current_dir():
+    """Return the directory containing the calling module."""
+    return os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+
+
+def get_num_lines(file):
+    """Get the number of lines in a file.
+
+    Args:
+        file (str): The path to the file.
+
+    Returns:
+        int: The number of lines in the file.
+    """
+    num_lines = subprocess.check_output(
+        ['wc', '-l', file]
+    ).split()[0]
+    return int(num_lines)
+
+
+def chunkify_file(
+    fname,
+    size=1024 * 1024 * 1000,
+    skiplines=-1
+):
+    """Divide a large text file into line-aligned chunks of roughly `size` bytes each.
+
+    Params:
+        fname: path to the file to be chunked
+        size: approximate size of each chunk in bytes
+        skiplines: number of lines at the beginning to skip; -1 means don't skip any lines
+    Returns:
+        list of (start position, length in bytes, fname) tuples, one per chunk
+    """
+    chunks = []
+    file_end = os.path.getsize(fname)
+    with open(fname, "rb") as f:
+        if skiplines > 0:
+            for i in range(skiplines):
+                f.readline()
+
+        chunk_end = f.tell()
+        while True:
+            chunk_start = chunk_end
+            f.seek(f.tell() + size, os.SEEK_SET)
+            f.readline()  # advance to the next newline so the chunk is line aligned
+            chunk_end = f.tell()
+            chunks.append((chunk_start, chunk_end - chunk_start, fname))
+
+            if chunk_end > file_end:
+                break
+    return chunks
+
+
+def parallel_apply_line_by_line_chunk(chunk_data):
+    """Apply a function to each line in a chunk.
+
+    Params:
+        chunk_data: (chunk_start, chunk_size, file_path, func_apply, *func_args)
+    Returns:
+        list of the non-None results for this chunk
+    """
+    chunk_start, chunk_size, file_path, func_apply = chunk_data[:4]
+    func_args = chunk_data[4:]
+
+    chunk_res = []
+    with open(file_path, "rb") as f:
+        f.seek(chunk_start)
+        cont = f.read(chunk_size).decode(encoding='utf-8')
+        lines = cont.splitlines()
+
+        for line in lines:
+            ret = func_apply(line, *func_args)
+            if ret is not None:
+                chunk_res.append(ret)
+    return chunk_res
+
+
+def parallel_apply_line_by_line(
+    input_file_path,
+    chunk_size_factor,
+    num_procs,
+    skiplines,
+    func_apply,
+    func_args,
+    fout=None
+):
+    """Apply a supplied function line by line in parallel.
+
+    Params:
+        input_file_path: path to the input file
+        chunk_size_factor: size of one chunk in MB
+        num_procs: number of parallel processes to spawn; capped by the number of available cores
+        skiplines: number of top lines to skip while processing
+        func_apply: a function that takes a line and returns None for lines that should be dropped
+        func_args: arguments for func_apply
+        fout: optional file object; if given, processed lines are written to it instead of collected
+    Returns:
+        list of the non-None results obtained by processing each line
+    """
+    num_parallel = min(num_procs, psutil.cpu_count()) - 1
+
+    jobs = chunkify_file(input_file_path, 1024 * 1024 * chunk_size_factor, skiplines)
+    jobs = [list(x) + [func_apply] + func_args for x in jobs]
+
+    print("Starting the parallel pool for {} jobs ".format(len(jobs)))
+
+    lines_counter = 0
+
+    # Without maxtasksperchild, worker processes linger and memory keeps growing.
+    pool = mp.Pool(num_parallel, maxtasksperchild=1000)
+
+    outputs = []
+    for i in range(0, len(jobs), num_parallel):
+        print("Chunk start = ", i)
+        t1 = time.time()
+        chunk_outputs = pool.map(
+            parallel_apply_line_by_line_chunk,
+            jobs[i: i + num_parallel]
+        )
+
+        for subl in chunk_outputs:
+            for x in subl:
+                if fout is not None:
+                    print(x, file=fout)
+                else:
+                    outputs.append(x)
+                lines_counter += 1
+        del chunk_outputs
+        gc.collect()
+        print("All Done in time ", time.time() - t1)
+
+    print("Total lines we have = {}".format(lines_counter))
+
+    pool.close()
+    pool.terminate()
+    return outputs
+
+
+def get_func_from_dir(score_dir: str) -> t.Tuple[t.Callable, str]:
+    """Get function and mode from a directory or a .py file.
+
+    Args:
+        score_dir (str): Path to a .py file, or to a directory containing a main.py.
+
+    Returns:
+        Tuple[Callable, str]: A tuple containing the main function and the mode.
+    """
+    if score_dir.endswith('.py'):
+        func_dir = pathlib.Path(score_dir).parent.resolve()
+        file_name = pathlib.Path(score_dir).stem
+    else:
+        func_dir = osp.abspath(score_dir)
+        file_name = "main"
+
+    sys.path.append(str(func_dir))
+    module = importlib.import_module(file_name)
+    try:
+        mode = module.MODE
+    except AttributeError:
+        mode = 'batch'
+    return module.main, mode
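A minimal sketch of the chunked parallel pipeline ("corpus.txt" and keep_long are hypothetical; the `multiprocess` and `psutil` dependencies must be installed):

from hdl.jupyfuncs.path.glob import parallel_apply_line_by_line

def keep_long(line, min_len):
    # Return None to drop a line; any other value is collected.
    return line if len(line) >= min_len else None

results = parallel_apply_line_by_line(
    input_file_path="corpus.txt",  # hypothetical input file
    chunk_size_factor=64,          # ~64 MB per chunk
    num_procs=4,
    skiplines=1,                   # skip a header line
    func_apply=keep_long,
    func_args=[40],
)
print(len(results))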
hdl/jupyfuncs/path/strings.py
@@ -0,0 +1,65 @@
+import re
+import subprocess
+
+
+def get_n_tokens(
+    paragraph,
+    model: str = None
+):
+    """Get the number of tokens in a paragraph using a specified model.
+
+    Args:
+        paragraph (str): The input paragraph to tokenize.
+        model (str): The name of the model to use for tokenization. If None, a rough CJK-aware word count is used.
+
+    Returns:
+        int: The number of tokens in the paragraph based on the specified model or the default CJK heuristic.
+    """
+    if model is None:
+        # Replace each CJK character with a standalone word so it counts as one token.
+        # (A stray 'h' inside the original character class was dropped.)
+        cjk_regex = re.compile(u'[\u1100-\uFFFD]+?')
+        trimmed_cjk = cjk_regex.sub(' a ', paragraph)
+        return len(trimmed_cjk.split())
+    else:
+        import tiktoken
+        encoding = tiktoken.encoding_for_model(model)
+        num_tokens = len(encoding.encode(paragraph))
+        return num_tokens
+
+
+def str_from_line(file, line, split=False):
+    """Retrieve a specific line from a file and process it.
+
+    Args:
+        file (str): The path to the file.
+        line (int): The line number to retrieve (starting from 0).
+        split (bool, optional): If True, split the line on whitespace and return the first field. Defaults to False.
+
+    Returns:
+        str: The content of the specified line from the file.
+    """
+    smi = subprocess.check_output(
+        # ['sed', '-n', f'{str(line + 1)}p', file]
+        ["sed", f"{str(line + 1)}q;d", file]
+    )
+    if isinstance(smi, bytes):
+        smi = smi.decode().strip()
+    if split:
+        if ' ' in smi or '\t' in smi:
+            smi = smi.split()[0]
+    return smi
+
+
+def splitted_strs_from_line(
+    file: str,
+    idx: int
+) -> list:
+    """Return the fields of the line at the given index in the file, split on whitespace.
+
+    Args:
+        file (str): The file path.
+        idx (int): The index of the line to split.
+
+    Returns:
+        list: The whitespace-separated fields of the line.
+    """
+    return str_from_line(file, idx).split()
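A quick sketch of both tokenization paths (the model name is illustrative; the second call needs the optional `tiktoken` package, and "data.tsv" is a hypothetical file):

from hdl.jupyfuncs.path.strings import get_n_tokens, str_from_line

print(get_n_tokens("深度学习 is fun"))                         # CJK-aware heuristic count
print(get_n_tokens("深度学习 is fun", model="gpt-3.5-turbo"))  # exact tiktoken count

first_field = str_from_line("data.tsv", 0, split=True)         # first field of line 0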
hdl/jupyfuncs/show/__init__.py
File without changes
hdl/jupyfuncs/show/pbar.py
@@ -0,0 +1,50 @@
+import sys
+from os import path as osp
+from IPython.core.display import HTML
+
+__all__ = [
+    'in_jupyter',
+    'tqdm',
+    'trange',
+    'tnrange',
+    'NO_WHITE',
+]
+
+
+# CSS snippet that hides empty output prompts in JupyterLab cells.
+NO_WHITE = HTML("""
+<style>
+.jp-OutputArea-prompt:empty {
+  padding: 0;
+  border: 0;
+}
+</style>
+""")
+
+
+def in_jupyter():
+    """Check if the code is running in a Jupyter notebook.
+
+    Returns:
+        bool: True if running in Jupyter notebook, False otherwise.
+    """
+    return 'ipykernel_launcher.py' in sys.argv[0]
+
+
+def in_docker():
+    """Check if the code is running inside a Docker container.
+
+    Returns:
+        bool: True if running inside a Docker container, False otherwise.
+    """
+    return osp.exists('/.dockerenv')
+
+
+# Pick the notebook-aware progress bars when running under Jupyter.
+if in_jupyter():
+    from tqdm.notebook import tqdm
+    from tqdm.notebook import trange
+    from tqdm.notebook import tnrange
+else:
+    from tqdm import tqdm
+    from tqdm import trange
+    from tqdm import tnrange