reme-ai 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reme_ai/__init__.py +1 -1
- reme_ai/config/default.yaml +16 -0
- reme_ai/constants/common_constants.py +0 -2
- reme_ai/constants/language_constants.py +1 -1
- reme_ai/enumeration/language_enum.py +14 -0
- reme_ai/summary/task/__init__.py +0 -1
- reme_ai/summary/task/trajectory_preprocess_op.py +2 -31
- reme_ai/utils/datetime_handler.py +1 -1
- {reme_ai-0.1.3.dist-info → reme_ai-0.1.4.dist-info}/METADATA +295 -132
- {reme_ai-0.1.3.dist-info → reme_ai-0.1.4.dist-info}/RECORD +14 -16
- reme_ai/enumeration/language_constants.py +0 -215
- reme_ai/summary/task/pdf_preprocess_op_wrapper.py +0 -50
- reme_ai/utils/miner_u_pdf_processor.py +0 -726
- {reme_ai-0.1.3.dist-info → reme_ai-0.1.4.dist-info}/WHEEL +0 -0
- {reme_ai-0.1.3.dist-info → reme_ai-0.1.4.dist-info}/entry_points.txt +0 -0
- {reme_ai-0.1.3.dist-info → reme_ai-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {reme_ai-0.1.3.dist-info → reme_ai-0.1.4.dist-info}/top_level.txt +0 -0
@@ -1,215 +0,0 @@
|
|
1
|
-
from memoryscope.enumeration.language_enum import LanguageEnum
|
2
|
-
|
3
|
-
# Per-language vocabulary of date/time related words and abbreviations, keyed
# by LanguageEnum member. It exists to help recognise datetime mentions in
# text; how the entries are matched (case sensitivity, substring vs. token)
# is up to the consumer and is not defined here.
DATATIME_WORD_LIST = {
    LanguageEnum.CN: [
        "天", "周", "月", "年", "星期",
        "点", "分钟", "小时", "秒",
        "上午", "下午", "早上", "早晨", "晚上", "中午",
        "日", "夜", "清晨", "傍晚", "凌晨",
        "岁",
    ],
    LanguageEnum.EN: [
        # Units of time
        "year", "yr",
        "month", "mo",
        "week", "wk",
        "day", "d",
        "hour", "hr",
        "minute", "min",
        "second", "sec",

        # Days of the week
        "Monday", "Mon",
        "Tuesday", "Tue", "Tues",
        "Wednesday", "Wed",
        "Thursday", "Thu", "Thur", "Thurs",
        "Friday", "Fri",
        "Saturday", "Sat",
        "Sunday", "Sun",

        # Months of the year
        "January", "Jan",
        "February", "Feb",
        "March", "Mar",
        "April", "Apr",
        "May",  # abbreviation is identical to the full name, so listed once
        "June", "Jun",
        "July", "Jul",
        "August", "Aug",
        "September", "Sep", "Sept",
        "October", "Oct",
        "November", "Nov",
        "December", "Dec",

        # Relative time references
        "Today",
        "Tomorrow", "Tmrw",
        "Yesterday", "Yday",
        "Now",
        "Morning", "AM", "a.m.",
        "Afternoon", "PM", "p.m.",
        "Evening",
        "Night",
        "Midnight",
        "Noon",

        # Seasonal references
        "Spring",
        "Summer",
        "Autumn", "Fall",
        "Winter",

        # General time references
        "Century", "cent.",
        "Decade",
        "Millennium",
        "Quarter", "Q1", "Q2", "Q3", "Q4",
        "Semester",
        "Fortnight",
        "Weekend",
    ],
}
|
91
|
-
|
92
|
-
# Weekday names for each supported language, listed Monday first and Sunday
# last, for calendar-related lookups.
WEEKDAYS = {
    LanguageEnum.CN: ["周一", "周二", "周三", "周四", "周五", "周六", "周日"],
    LanguageEnum.EN: [
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday",
    ],
}
|
114
|
-
|
115
|
-
# Month names for each supported language, in calendar order (index 0 is
# January / "1月").
MONTH_DICT = {
    LanguageEnum.CN: [
        "1月", "2月", "3月", "4月", "5月", "6月",
        "7月", "8月", "9月", "10月", "11月", "12月",
    ],
    LanguageEnum.EN: [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ],
}
|
145
|
-
|
146
|
-
# Word for "none" in each supported language.
NONE_WORD = {LanguageEnum.CN: "无", LanguageEnum.EN: "none"}

# Word for "repeated" in each supported language.
REPEATED_WORD = {LanguageEnum.CN: "重复", LanguageEnum.EN: "repeated"}

# Word for "contradictory" in each supported language.
CONTRADICTORY_WORD = {LanguageEnum.CN: "矛盾", LanguageEnum.EN: "contradiction"}

# Word for "contained" in each supported language.
CONTAINED_WORD = {LanguageEnum.CN: "被包含", LanguageEnum.EN: "contained"}

# Colon separator used for each supported language.
# NOTE(review): both entries are the same character here -- confirm whether
# the CN entry was meant to be the full-width colon.
COLON_WORD = {LanguageEnum.CN: ":", LanguageEnum.EN: ":"}

# Comma separator used for each supported language.
# NOTE(review): both entries are the same character here -- confirm whether
# the CN entry was meant to be the full-width comma.
COMMA_WORD = {LanguageEnum.CN: ",", LanguageEnum.EN: ","}

# Placeholder name for the human participant in each supported language.
DEFAULT_HUMAN_NAME = {LanguageEnum.CN: "用户", LanguageEnum.EN: "user"}
|
187
|
-
|
188
|
-
# For each supported language, maps the natural-language datetime label to the
# standardized key ("year", "month", "day", "week", "weekday").
DATATIME_KEY_MAP = {
    LanguageEnum.CN: {
        "年": "year",
        "月": "month",
        "日": "day",
        "周": "week",
        "星期几": "weekday",
    },
    LanguageEnum.EN: {
        "Year": "year",
        "Month": "month",
        "Day": "day",
        "Week": "week",
        "Weekday": "weekday",
    },
}
|
205
|
-
|
206
|
-
# Label that marks an inferred time, in each supported language.
TIME_INFER_WORD = {LanguageEnum.CN: "推断时间", LanguageEnum.EN: "Inference time"}

# Sentence template stating the user's name; fill the "{name}" placeholder
# (e.g. with str.format) before use.
USER_NAME_EXPRESSION = {
    LanguageEnum.CN: "用户姓名是{name}。",
    LanguageEnum.EN: "User's name is {name}.",
}
|
@@ -1,50 +0,0 @@
|
|
1
|
-
from flowllm import C, BaseOp
|
2
|
-
from loguru import logger
|
3
|
-
|
4
|
-
from reme_ai.utils.miner_u_pdf_processor import MinerUPDFProcessor, chunk_pdf_content
|
5
|
-
|
6
|
-
|
7
|
-
@C.register_op()
class PDFPreprocessOp(BaseOp):
    """Op that parses a PDF with ``MinerUPDFProcessor`` and optionally chunks it.

    Context inputs:
        pdf_path: path of the PDF to process; the op logs an error and does
            nothing when it is missing or empty.
        output_dir: forwarded unchanged to ``process_pdf``.

    Context outputs (set on success, reset to empty values on failure):
        pdf_content_list, pdf_markdown_content, pdf_chunks.

    Recognised ``op_params`` keys:
        method (default ``"auto"``), lang (default ``None``),
        backend (default ``"pipeline"``), create_chunks (default ``True``),
        max_chunk_length (default ``4000``).
    """

    file_path: str = __file__

    def execute(self):
        """Process PDF files using MinerU and chunk content.

        Any exception raised while processing or chunking is logged together
        with its traceback and swallowed; the three context outputs are then
        set to empty values so later readers always find them.
        """
        pdf_path = self.context.get("pdf_path")
        output_dir = self.context.get("output_dir")

        if not pdf_path:
            # Nothing to process: leave the context untouched.
            logger.error("No PDF path provided in context")
            return

        processor = MinerUPDFProcessor(log_level="INFO")

        try:
            content_list, markdown_content = processor.process_pdf(
                pdf_path=pdf_path,
                output_dir=output_dir,
                method=self.op_params.get("method", "auto"),
                lang=self.op_params.get("lang"),
                backend=self.op_params.get("backend", "pipeline"),
            )

            # Chunking is on by default and can be disabled via op_params.
            chunks = []
            if self.op_params.get("create_chunks", True):
                max_length = self.op_params.get("max_chunk_length", 4000)
                chunks = chunk_pdf_content(content_list, max_length=max_length)

            # Publish the results for later ops.
            self.context.pdf_content_list = content_list
            self.context.pdf_markdown_content = markdown_content
            self.context.pdf_chunks = chunks

            logger.info(f"PDF processing completed: {len(content_list)} content blocks, "
                        f"{len(chunks)} chunks, {len(markdown_content)} characters of markdown")

        except Exception as e:
            # Best-effort op: do not propagate, but keep the traceback in the
            # log so the failure can be diagnosed.
            logger.exception(f"PDF processing failed: {e}")
            self.context.pdf_content_list = []
            self.context.pdf_markdown_content = ""
            self.context.pdf_chunks = []
|