auto-coder 0.1.348__py3-none-any.whl → 0.1.349__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of auto-coder might be problematic.
Files changed (35)
  1. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/METADATA +1 -1
  2. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/RECORD +35 -26
  3. autocoder/auto_coder_runner.py +14 -10
  4. autocoder/chat_auto_coder_lang.py +5 -3
  5. autocoder/common/model_speed_tester.py +392 -0
  6. autocoder/common/printer.py +7 -8
  7. autocoder/common/run_cmd.py +247 -0
  8. autocoder/common/test_run_cmd.py +110 -0
  9. autocoder/common/v2/agent/agentic_edit.py +61 -11
  10. autocoder/common/v2/agent/agentic_edit_conversation.py +9 -0
  11. autocoder/common/v2/agent/agentic_edit_tools/execute_command_tool_resolver.py +21 -36
  12. autocoder/common/v2/agent/agentic_edit_tools/list_files_tool_resolver.py +4 -7
  13. autocoder/common/v2/agent/agentic_edit_tools/search_files_tool_resolver.py +2 -5
  14. autocoder/helper/rag_doc_creator.py +141 -0
  15. autocoder/ignorefiles/__init__.py +4 -0
  16. autocoder/ignorefiles/ignore_file_utils.py +63 -0
  17. autocoder/ignorefiles/test_ignore_file_utils.py +91 -0
  18. autocoder/models.py +49 -9
  19. autocoder/rag/cache/byzer_storage_cache.py +10 -4
  20. autocoder/rag/cache/file_monitor_cache.py +27 -24
  21. autocoder/rag/cache/local_byzer_storage_cache.py +11 -5
  22. autocoder/rag/cache/local_duckdb_storage_cache.py +203 -128
  23. autocoder/rag/cache/simple_cache.py +56 -37
  24. autocoder/rag/loaders/filter_utils.py +106 -0
  25. autocoder/rag/loaders/image_loader.py +45 -23
  26. autocoder/rag/loaders/pdf_loader.py +3 -3
  27. autocoder/rag/loaders/test_image_loader.py +209 -0
  28. autocoder/rag/qa_conversation_strategy.py +3 -5
  29. autocoder/rag/utils.py +20 -9
  30. autocoder/utils/_markitdown.py +35 -0
  31. autocoder/version.py +1 -1
  32. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/LICENSE +0 -0
  33. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/WHEEL +0 -0
  34. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/entry_points.txt +0 -0
  35. {auto_coder-0.1.348.dist-info → auto_coder-0.1.349.dist-info}/top_level.txt +0 -0

autocoder/rag/cache/simple_cache.py

@@ -1,4 +1,5 @@
 from multiprocessing import Pool
+import functools
 from autocoder.common import SourceCode
 from autocoder.rag.cache.base_cache import (
     BaseCacheManager, DeleteEvent, AddOrUpdateEvent,
@@ -20,6 +21,9 @@ from autocoder.rag.utils import process_file_in_multi_process, process_file_local
 from autocoder.rag.variable_holder import VariableHolder
 import hashlib
 from .failed_files_utils import load_failed_files, save_failed_files
+from autocoder.common import AutoCoderArgs
+from byzerllm import SimpleByzerLLM, ByzerLLM
+from autocoder.utils.llms import get_llm_names


 default_ignore_dirs = [
@@ -46,16 +50,16 @@ def generate_content_md5(content: Union[str, bytes]) -> str:


 class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
-    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5, args=None, llm=None):
+    def __init__(self, path: str, ignore_spec, required_exts: list, update_interval: int = 5, args: Optional[AutoCoderArgs] = None, llm: Optional[Union[ByzerLLM, SimpleByzerLLM, str]] = None):
         """
         Initialize the async update queue used to manage the cache of code files.
-
+
         Args:
             path: root directory of the code base to index
             ignore_spec: rules describing which files/directories should be ignored
             required_exts: list of file extensions that need to be processed
             update_interval: interval in seconds between automatic update checks, default 5
-
+
         Cache structure (self.cache):
         self.cache is a dict with the following structure:
         {
@@ -69,23 +73,23 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             "file_path2": { ... },
             ...
         }
-
+
         The cache is persisted under the project root in .cache/cache.jsonl, stored in JSONL format.
         It is loaded from disk on every startup and updated asynchronously when files change.
-
+
         Source-code processing functions:
         Two key functions are used while the cache is being updated:
-
+
         1. process_file_in_multi_process: processes files in a multi-process environment
            - Args: file_info (tuple of file information)
            - Returns: List[SourceCode] or None
            - Purpose: process many files in parallel during the initial load
-
+
         2. process_file_local: processes a single file in the current process
            - Args: file_path (path of the file)
            - Returns: List[SourceCode] or None
            - Purpose: process a single file when an update to it is detected
-
+
         The lists of SourceCode objects returned by these functions are serialized to dicts via model_dump()
         and stored in the cache's "content" field. If the result is empty, the cache update is skipped.
         """
@@ -94,6 +98,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.required_exts = required_exts
         self.args = args
         self.llm = llm
+        self.product_mode = args.product_mode or "lite"
         self.update_interval = update_interval
         self.queue = []
         self.cache = {}  # initialized as an empty dict, filled later via read_cache()
@@ -101,7 +106,8 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.stop_event = threading.Event()

         # set of paths of files that failed to parse
-        self.failed_files_path = os.path.join(self.path, ".cache", "failed_files.json")
+        self.failed_files_path = os.path.join(
+            self.path, ".cache", "failed_files.json")
         self.failed_files = load_failed_files(self.failed_files_path)

         # start the thread that processes the queue
@@ -116,7 +122,6 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):

         self.cache = self.read_cache()

-
     def _process_queue(self):
         while not self.stop_event.is_set():
             try:
@@ -124,12 +129,13 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             except Exception as e:
                 logger.error(f"Error in process_queue: {e}")
             time.sleep(1)  # avoid checking too frequently
-
+
     def _periodic_update(self):
         """Periodically trigger the file update check."""
-        while not self.stop_event.is_set():
+        while not self.stop_event.is_set():
             try:
-                logger.debug(f"Periodic update triggered (every {self.update_interval}s)")
+                logger.debug(
+                    f"Periodic update triggered (every {self.update_interval}s)")
                 # if the cache has never been initialized, do not trigger incremental updates
                 if not self.cache:
                     time.sleep(self.update_interval)
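
For context, _periodic_update and _process_queue both follow the usual stop_event-driven worker loop. A minimal standalone sketch of that pattern, with invented names and assuming nothing about the package beyond what the hunk above shows:

import threading

class PeriodicWorker:
    """Sketch of a stop_event-driven periodic worker: wake up every `interval` seconds until stop()."""

    def __init__(self, interval: float = 5.0):
        self.interval = interval
        self.stop_event = threading.Event()
        self.thread = threading.Thread(target=self._run, daemon=True)

    def _run(self):
        while not self.stop_event.is_set():
            print("periodic update triggered")   # placeholder for the real update check
            self.stop_event.wait(self.interval)  # returns early once stop() is called

    def start(self):
        self.thread.start()

    def stop(self):
        self.stop_event.set()
        self.thread.join()
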
@@ -145,7 +151,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         self.update_thread.join()

     def fileinfo_to_tuple(self, file_info: FileInfo) -> Tuple[str, str, float, str]:
-        return (file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5)
+        return (file_info.file_path, file_info.relative_path, file_info.modify_time, file_info.file_md5)

     def __del__(self):
         self.stop()
@@ -159,7 +165,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             file_path, _, modify_time, file_md5 = file_info
             if (
                 file_path not in self.cache
-                or self.cache[file_path].get("md5","") != file_md5
+                or self.cache[file_path].get("md5", "") != file_md5
             ):
                 files_to_process.append(file_info)
         if not files_to_process:
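
The change detection here compares the cached md5 against the file's current file_md5. The body of generate_content_md5 (referenced in an earlier hunk header) is not part of this diff; a plausible sketch of such a content hash, assuming the usual hashlib approach:

import hashlib
from typing import Union

def content_md5(content: Union[str, bytes]) -> str:
    # Hash the raw bytes; text is encoded as UTF-8 first.
    if isinstance(content, str):
        content = content.encode("utf-8")
    return hashlib.md5(content).hexdigest()
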
@@ -169,20 +175,23 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         #     [process_file.remote(file_info) for file_info in files_to_process]
         # )
         from autocoder.rag.token_counter import initialize_tokenizer
-
+        llm_name = get_llm_names(self.llm)[0] if self.llm else None
         with Pool(
             processes=os.cpu_count(),
             initializer=initialize_tokenizer,
             initargs=(VariableHolder.TOKENIZER_PATH,),
         ) as pool:
-            results = pool.map(
-                process_file_in_multi_process, files_to_process)
+
+            worker_func = functools.partial(
+                process_file_in_multi_process, llm=llm_name, product_mode=self.product_mode)
+            results = pool.map(worker_func, files_to_process)

         for file_info, result in zip(files_to_process, results):
             if result:  # only update the cache when the result is not empty
                 self.update_cache(file_info, result)
             else:
-                logger.warning(f"Empty result for file: {file_info[0]}, skipping cache update")
+                logger.warning(
+                    f"Empty result for file: {file_info[0]}, skipping cache update")

         self.write_cache()
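
The hunk above replaces the bare pool.map callable with a functools.partial so that the extra llm and product_mode keyword arguments reach every worker process. A self-contained sketch of that pattern; the worker function and its arguments are invented for illustration:

import functools
from multiprocessing import Pool

def process_item(item, llm=None, product_mode="lite"):
    # pool.map supplies `item`; functools.partial pre-binds the keyword arguments.
    return f"{item}:{llm}:{product_mode}"

if __name__ == "__main__":
    worker = functools.partial(process_item, llm="model-a", product_mode="lite")
    with Pool(processes=2) as pool:
        results = pool.map(worker, ["a.py", "b.py"])
    print(results)  # ['a.py:model-a:lite', 'b.py:model-a:lite']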
 
@@ -195,14 +204,15 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
             current_files.add(file_path)
             # if this file failed to parse before, skip it in this incremental update
             if file_path in self.failed_files:
-                logger.info(f"文件 {file_path} 之前解析失败,跳过此次更新")
+                # logger.info(f"文件 {file_path} 之前解析失败,跳过此次更新")
                 continue
             # change detection
             if (
                 file_path not in self.cache
                 or self.cache[file_path].get("md5", "") != file_md5
             ):
-                files_to_process.append((file_path, relative_path, modify_time, file_md5))
+                files_to_process.append(
+                    (file_path, relative_path, modify_time, file_md5))

         deleted_files = set(self.cache.keys()) - current_files
         logger.info(f"files_to_process: {files_to_process}")
@@ -231,29 +241,38 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                     # also drop it from the failed list on delete (in case the file has been fixed)
                     if item in self.failed_files:
                         self.failed_files.remove(item)
-                        save_failed_files(self.failed_files_path, self.failed_files)
+                        save_failed_files(
+                            self.failed_files_path, self.failed_files)
             elif isinstance(file_list, AddOrUpdateEvent):
                 for file_info in file_list.file_infos:
-                    logger.info(f"{file_info.file_path} is detected to be updated")
+                    logger.info(
+                        f"{file_info.file_path} is detected to be updated")
                     try:
-                        result = process_file_local(file_info.file_path)
+                        result = process_file_local(
+                            file_info.file_path, llm=self.llm, product_mode=self.product_mode)
                         if result:
                             # parsed successfully and non-empty
-                            self.update_cache(self.fileinfo_to_tuple(file_info), result)
+                            self.update_cache(
+                                self.fileinfo_to_tuple(file_info), result)
                             # if it failed before and succeeded this time, remove the failure record
                             if file_info.file_path in self.failed_files:
                                 self.failed_files.remove(file_info.file_path)
-                                save_failed_files(self.failed_files_path, self.failed_files)
+                                save_failed_files(
+                                    self.failed_files_path, self.failed_files)
                         else:
                             # an empty result is also treated as a parse failure; add it to the failed list
-                            logger.warning(f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
+                            logger.warning(
+                                f"Empty result for file: {file_info.file_path}, treat as parse failed, skipping cache update")
                             self.failed_files.add(file_info.file_path)
-                            save_failed_files(self.failed_files_path, self.failed_files)
+                            save_failed_files(
+                                self.failed_files_path, self.failed_files)
                     except Exception as e:
-                        logger.error(f"SimpleCache Error in process_queue: {e}")
+                        logger.error(
+                            f"SimpleCache Error in process_queue: {e}")
                         # on parse failure, add the file to the failed list
                         self.failed_files.add(file_info.file_path)
-                        save_failed_files(self.failed_files_path, self.failed_files)
+                        save_failed_files(
+                            self.failed_files_path, self.failed_files)

         self.write_cache()

@@ -266,7 +285,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):

         cache = {}
         if os.path.exists(cache_file):
-            with open(cache_file, "r",encoding="utf-8") as f:
+            with open(cache_file, "r", encoding="utf-8") as f:
                 for line in f:
                     data = json.loads(line)
                     cache[data["file_path"]] = data
@@ -277,7 +296,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
         cache_file = os.path.join(cache_dir, "cache.jsonl")

         if not fcntl:
-            with open(cache_file, "w",encoding="utf-8") as f:
+            with open(cache_file, "w", encoding="utf-8") as f:
                 for data in self.cache.values():
                     try:
                         json.dump(data, f, ensure_ascii=False)
@@ -287,12 +306,12 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                             f"Failed to write {data['file_path']} to .cache/cache.jsonl: {e}")
         else:
             lock_file = cache_file + ".lock"
-            with open(lock_file, "w",encoding="utf-8") as lockf:
+            with open(lock_file, "w", encoding="utf-8") as lockf:
                 try:
                     # acquire the file lock
                     fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
                     # write the cache file
-                    with open(cache_file, "w",encoding="utf-8") as f:
+                    with open(cache_file, "w", encoding="utf-8") as f:
                         for data in self.cache.values():
                             try:
                                 json.dump(data, f, ensure_ascii=False)
@@ -310,11 +329,11 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
     ):
         """
         Update a file's information in the cache.
-
+
         Args:
             file_info: tuple with the file information (file_path, relative_path, modify_time, file_md5)
             content: parsed file content, a list of SourceCode objects
-
+
         Notes:
             This method writes the file's latest content into the cache. A cache entry has the structure:
             {
@@ -324,7 +343,7 @@ class AutoCoderRAGAsyncUpdateQueue(BaseCacheManager):
                 "modify_time": float,  # timestamp of the file's last modification
                 "md5": str             # MD5 hash of the file content, used to detect changes
            }
-
+
            It does not write to disk immediately; call write_cache() to persist the updated cache.
        """
        file_path, relative_path, modify_time, file_md5 = file_info
autocoder/rag/loaders/filter_utils.py (new file)

@@ -0,0 +1,106 @@
+
+import os
+import json
+from typing import Dict, Optional
+from loguru import logger
+
+class FilterRuleManager:
+    '''
+    {
+        "whitelist": [
+            "glob:*.png",
+            "regex:^/tmp/.*hidden.*"
+        ],
+        "blacklist": [
+            "glob:*/private/*",
+            "regex:.*/secret/.*\\.jpg$"
+        ]
+    }
+    '''
+    _cache_rules: Optional[Dict] = None
+    _cache_mtime: Optional[float] = None
+
+    def __init__(self, llm, source_dir: str):
+        """
+        Initialize the filter rule manager.
+
+        Args:
+            llm: LLM object; currently unused, reserved for later use
+            source_dir: path of the project root directory
+        """
+        self.llm = llm
+        self.source_dir = source_dir
+        self.filter_rules_path = os.path.join(self.source_dir, ".cache", "filterrules")
+
+    def load_filter_rules(self) -> Dict:
+        try:
+            current_mtime = os.path.getmtime(self.filter_rules_path) if os.path.exists(self.filter_rules_path) else None
+        except Exception:
+            current_mtime = None
+
+        need_reload = False
+
+        # reload when the cache is empty or the file has been updated
+        if FilterRuleManager._cache_rules is None:
+            need_reload = True
+        elif current_mtime is not None and FilterRuleManager._cache_mtime != current_mtime:
+            need_reload = True
+
+        if need_reload:
+            FilterRuleManager._cache_rules = {"whitelist": [], "blacklist": []}
+            try:
+                if os.path.exists(self.filter_rules_path):
+                    with open(self.filter_rules_path, "r", encoding="utf-8") as f:
+                        FilterRuleManager._cache_rules = json.load(f)
+                    FilterRuleManager._cache_mtime = current_mtime
+            except Exception as e:
+                logger.warning(f"Failed to load filterrules: {e}")
+
+        return FilterRuleManager._cache_rules or {"whitelist": [], "blacklist": []}
+
+    def should_parse_image(self, file_path: str) -> bool:
+        """
+        Decide whether the image in the given file should be parsed.
+
+        Supported rule formats:
+        - glob patterns, e.g. "glob:*.png" or "*.png"
+        - regular expressions, e.g. "regex:^/tmp/.*hidden.*"
+
+        Returns:
+            True if the file should be parsed
+            False otherwise
+        """
+        import fnmatch
+        import re
+
+        rules = self.load_filter_rules()
+        whitelist = rules.get("whitelist", [])
+        blacklist = rules.get("blacklist", [])
+
+        def match_pattern(pattern: str, path: str) -> bool:
+            if pattern.startswith("glob:"):
+                pat = pattern[len("glob:"):]
+                return fnmatch.fnmatch(path, pat)
+            elif pattern.startswith("regex:"):
+                pat = pattern[len("regex:"):]
+                try:
+                    return re.search(pat, path) is not None
+                except re.error:
+                    logger.warning(f"Invalid regex pattern: {pat}")
+                    return False
+            else:
+                # treat patterns without a prefix as globs
+                return fnmatch.fnmatch(path, pattern)
+
+        # the blacklist is matched first
+        for pattern in blacklist:
+            if match_pattern(pattern, file_path):
+                return False
+
+        # then the whitelist
+        for pattern in whitelist:
+            if match_pattern(pattern, file_path):
+                return True
+
+        # by default, do not parse
+        return False
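
To make the new module concrete, here is a hedged usage sketch: it writes a sample .cache/filterrules file in the format shown in the class docstring and queries should_parse_image. The project path and rules are invented; the import path follows the file list above.

import json
import os

from autocoder.rag.loaders.filter_utils import FilterRuleManager

project_root = "/tmp/demo_project"
os.makedirs(os.path.join(project_root, ".cache"), exist_ok=True)

# Blacklist is checked first, then whitelist; anything unmatched is not parsed.
rules = {
    "whitelist": ["glob:*.png"],
    "blacklist": ["glob:*/private/*"],
}
with open(os.path.join(project_root, ".cache", "filterrules"), "w", encoding="utf-8") as f:
    json.dump(rules, f)

manager = FilterRuleManager(llm=None, source_dir=project_root)
print(manager.should_parse_image("/docs/diagram.png"))    # True: whitelist glob matches
print(manager.should_parse_image("/data/private/x.png"))  # False: blacklist wins
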
autocoder/rag/loaders/image_loader.py

@@ -18,6 +18,7 @@ from byzerllm.utils.client import code_utils
 from autocoder.utils.llms import get_single_llm
 from loguru import logger
 from typing import List, Tuple, Optional
+from autocoder.common.text import TextSimilarity
 from pydantic import BaseModel


@@ -280,6 +281,20 @@ class ImageLoader:
         except Exception:
             traceback.print_exc()
             return ""
+
+    @staticmethod
+    def extract_replace_in_file_tools(response)->List[ReplaceInFileTool]:
+        tools = []
+        # Pattern to match replace_in_file tool blocks
+        pattern = r'<replace_in_file>\s*<path>(.*?)</path>\s*<diff>(.*?)</diff>\s*</replace_in_file>'
+        matches = re.finditer(pattern, response, re.DOTALL)
+
+        for match in matches:
+            path = match.group(1).strip()
+            diff = match.group(2).strip()
+            tools.append(ReplaceInFileTool(path=path, diff=diff))
+
+        return tools

     @staticmethod
     def format_table_in_content(content: str, llm=None) -> str:
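
The new extract_replace_in_file_tools static method pulls replace_in_file blocks out of an LLM response. An illustrative call with an invented response string; the SEARCH/REPLACE markers inside the diff payload are an assumption about what parse_diff expects and are not shown in this diff:

from autocoder.rag.loaders.image_loader import ImageLoader

response = """<replace_in_file>
<path>content</path>
<diff>
<<<<<<< SEARCH
| a | b |
=======
| a | b |
|---|---|
>>>>>>> REPLACE
</diff>
</replace_in_file>"""

tools = ImageLoader.extract_replace_in_file_tools(response)
print(tools[0].path)  # "content"
print(tools[0].diff)  # the block between the <diff> tags, stripped
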
@@ -406,35 +421,42 @@ class ImageLoader:
         '''

         # Run the prompt with the provided content
-        tool_response = _format_table.with_llm(llm).run(content)
-
-        # Parse the tool response to extract replace_in_file tool calls
-        def extract_replace_in_file_tools(response):
-            tools = []
-            # Pattern to match replace_in_file tool blocks
-            pattern = r'<replace_in_file>\s*<path>(.*?)</path>\s*<diff>(.*?)</diff>\s*</replace_in_file>'
-            matches = re.finditer(pattern, response, re.DOTALL)
-
-            for match in matches:
-                path = match.group(1).strip()
-                diff = match.group(2).strip()
-                tools.append(ReplaceInFileTool(path=path, diff=diff))
-
-            return tools
+        tool_response = _format_table.with_llm(llm).run(content)

         # Extract tools from the response
-        tools = extract_replace_in_file_tools(tool_response)
+        tools = ImageLoader.extract_replace_in_file_tools(tool_response)

         # Process each tool to apply the replacements
         formatted_content = content
         for tool in tools:
-            # For in-memory content replacement (not actual file modification)
-            if tool.path == "content":
-                # Parse the diff to get search/replace blocks
-                blocks = ImageLoader.parse_diff(tool.diff)
-                # Apply each replacement to the content
-                for search_block, replace_block in blocks:
-                    formatted_content = formatted_content.replace(search_block, replace_block)
+            # For in-memory content replacement (not actual file modification)
+            # Parse the diff to get search/replace blocks
+            blocks = ImageLoader.parse_diff(tool.diff)
+            # Apply each replacement to the content
+            for search_block, replace_block in blocks:
+                # Check if the search_block exists in the content
+                if search_block in formatted_content:
+                    # Replace and verify the replacement occurred
+                    new_content = formatted_content.replace(search_block, replace_block)
+                    if new_content == formatted_content:
+                        logger.warning(f"Replacement failed despite search block found. Search block length: {len(search_block)}")
+                        print(f"\n=== FAILED SEARCH BLOCK ===\n{search_block}\n=== END FAILED SEARCH BLOCK ===\n")
+                    formatted_content = new_content
+                else:
+                    # Fallback to similarity matching when exact match fails
+                    logger.warning(f"Search block not found in content. Trying similarity matching. Search block length: {len(search_block)}")
+                    print(f"\n=== NOT FOUND SEARCH BLOCK (trying similarity) ===\n{search_block}\n=== END NOT FOUND SEARCH BLOCK ===\n")
+
+                    # Use TextSimilarity to find the best matching window
+                    similarity, best_window = TextSimilarity(search_block, formatted_content).get_best_matching_window()
+                    similarity_threshold = 0.8  # Can be adjusted based on needs
+
+                    if similarity > similarity_threshold:
+                        logger.info(f"Found similar block with similarity {similarity:.2f}")
+                        print(f"\n=== SIMILAR BLOCK FOUND (similarity: {similarity:.2f}) ===\n{best_window}\n=== END SIMILAR BLOCK ===\n")
+                        formatted_content = formatted_content.replace(best_window, replace_block, 1)
+                    else:
+                        logger.warning(f"No similar block found. Best similarity: {similarity:.2f}")

         return formatted_content
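
The replacement loop above tries an exact search-block match first and only then falls back to TextSimilarity.get_best_matching_window with a 0.8 threshold. TextSimilarity itself is not part of this diff; a rough standalone sketch of the same exact-match-then-fuzzy-fallback idea, using difflib instead:

import difflib

def replace_with_fallback(text: str, search: str, replacement: str, threshold: float = 0.8) -> str:
    # Exact match first, mirroring the loop above.
    if search in text:
        return text.replace(search, replacement)
    # Otherwise scan same-sized line windows for the most similar block.
    lines = text.splitlines(keepends=True)
    window = max(len(search.splitlines()), 1)
    best_ratio, best_span = 0.0, None
    for i in range(max(len(lines) - window + 1, 1)):
        candidate = "".join(lines[i:i + window])
        ratio = difflib.SequenceMatcher(None, search, candidate).ratio()
        if ratio > best_ratio:
            best_ratio, best_span = ratio, candidate
    if best_span is not None and best_ratio > threshold:
        return text.replace(best_span, replacement, 1)
    return text  # no sufficiently similar block; leave the text unchanged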
 
autocoder/rag/loaders/pdf_loader.py

@@ -14,9 +14,9 @@ def extract_text_from_pdf_old(file_path):
         text += page.extract_text()
     return text

-def extract_text_from_pdf(file_path):
-    try:
-        md_converter = MarkItDown()
+def extract_text_from_pdf(file_path, llm=None, product_mode="lite"):
+    try:
+        md_converter = MarkItDown(llm=llm, product_mode=product_mode)
         result = md_converter.convert(file_path)
         return result.text_content
     except (BaseException, Exception) as e:
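
For completeness, the updated extract_text_from_pdf keeps its old single-argument behaviour when no LLM is supplied; a minimal hedged usage sketch with a placeholder path:

from autocoder.rag.loaders.pdf_loader import extract_text_from_pdf

# llm and product_mode default to None / "lite", so existing callers keep working.
text = extract_text_from_pdf("docs/example.pdf")
print(text[:200])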