flowllm 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. flowllm/__init__.py +12 -0
  2. flowllm/app.py +25 -0
  3. flowllm/config/default_config.yaml +82 -0
  4. flowllm/config/pydantic_config_parser.py +242 -0
  5. flowllm/context/base_context.py +59 -0
  6. flowllm/context/flow_context.py +28 -0
  7. llmflow/op/prompt_mixin.py → flowllm/context/prompt_handler.py +25 -14
  8. flowllm/context/registry.py +26 -0
  9. flowllm/context/service_context.py +103 -0
  10. flowllm/embedding_model/__init__.py +1 -0
  11. {llmflow → flowllm}/embedding_model/base_embedding_model.py +2 -2
  12. {llmflow → flowllm}/embedding_model/openai_compatible_embedding_model.py +8 -8
  13. flowllm/flow_engine/__init__.py +1 -0
  14. flowllm/flow_engine/base_flow_engine.py +34 -0
  15. flowllm/flow_engine/simple_flow_engine.py +213 -0
  16. flowllm/llm/__init__.py +1 -0
  17. {llmflow → flowllm}/llm/base_llm.py +16 -24
  18. {llmflow → flowllm}/llm/openai_compatible_llm.py +64 -108
  19. flowllm/op/__init__.py +3 -0
  20. flowllm/op/akshare/get_ak_a_code_op.py +116 -0
  21. flowllm/op/akshare/get_ak_a_code_prompt.yaml +21 -0
  22. flowllm/op/akshare/get_ak_a_info_op.py +143 -0
  23. flowllm/op/base_op.py +169 -0
  24. flowllm/op/llm_base_op.py +63 -0
  25. flowllm/op/mock_op.py +42 -0
  26. flowllm/op/parallel_op.py +30 -0
  27. flowllm/op/sequential_op.py +29 -0
  28. flowllm/schema/flow_response.py +12 -0
  29. flowllm/schema/message.py +35 -0
  30. flowllm/schema/service_config.py +76 -0
  31. flowllm/schema/tool_call.py +110 -0
  32. flowllm/service/__init__.py +2 -0
  33. flowllm/service/base_service.py +59 -0
  34. flowllm/service/http_service.py +87 -0
  35. flowllm/service/mcp_service.py +45 -0
  36. flowllm/storage/__init__.py +1 -0
  37. flowllm/storage/vector_store/__init__.py +3 -0
  38. flowllm/storage/vector_store/base_vector_store.py +44 -0
  39. {llmflow → flowllm/storage}/vector_store/chroma_vector_store.py +11 -10
  40. {llmflow → flowllm/storage}/vector_store/es_vector_store.py +10 -9
  41. llmflow/vector_store/file_vector_store.py → flowllm/storage/vector_store/local_vector_store.py +110 -10
  42. flowllm/utils/common_utils.py +64 -0
  43. flowllm/utils/dataframe_cache.py +331 -0
  44. flowllm/utils/fetch_url.py +113 -0
  45. {llmflow → flowllm}/utils/timer.py +5 -4
  46. {flowllm-0.1.0.dist-info → flowllm-0.1.1.dist-info}/METADATA +31 -27
  47. flowllm-0.1.1.dist-info/RECORD +62 -0
  48. flowllm-0.1.1.dist-info/entry_points.txt +4 -0
  49. {flowllm-0.1.0.dist-info → flowllm-0.1.1.dist-info}/licenses/LICENSE +1 -1
  50. flowllm-0.1.1.dist-info/top_level.txt +1 -0
  51. flowllm-0.1.0.dist-info/RECORD +0 -66
  52. flowllm-0.1.0.dist-info/entry_points.txt +0 -3
  53. flowllm-0.1.0.dist-info/top_level.txt +0 -1
  54. llmflow/app.py +0 -53
  55. llmflow/config/config_parser.py +0 -80
  56. llmflow/config/mock_config.yaml +0 -58
  57. llmflow/embedding_model/__init__.py +0 -5
  58. llmflow/enumeration/agent_state.py +0 -8
  59. llmflow/llm/__init__.py +0 -5
  60. llmflow/mcp_server.py +0 -110
  61. llmflow/op/__init__.py +0 -10
  62. llmflow/op/base_op.py +0 -125
  63. llmflow/op/mock_op.py +0 -40
  64. llmflow/op/react/react_v1_op.py +0 -88
  65. llmflow/op/react/react_v1_prompt.yaml +0 -28
  66. llmflow/op/vector_store/__init__.py +0 -13
  67. llmflow/op/vector_store/recall_vector_store_op.py +0 -48
  68. llmflow/op/vector_store/update_vector_store_op.py +0 -28
  69. llmflow/op/vector_store/vector_store_action_op.py +0 -46
  70. llmflow/pipeline/pipeline.py +0 -94
  71. llmflow/pipeline/pipeline_context.py +0 -37
  72. llmflow/schema/app_config.py +0 -69
  73. llmflow/schema/experience.py +0 -144
  74. llmflow/schema/message.py +0 -68
  75. llmflow/schema/request.py +0 -32
  76. llmflow/schema/response.py +0 -29
  77. llmflow/service/__init__.py +0 -0
  78. llmflow/service/llmflow_service.py +0 -96
  79. llmflow/tool/__init__.py +0 -9
  80. llmflow/tool/base_tool.py +0 -80
  81. llmflow/tool/code_tool.py +0 -43
  82. llmflow/tool/dashscope_search_tool.py +0 -162
  83. llmflow/tool/mcp_tool.py +0 -77
  84. llmflow/tool/tavily_search_tool.py +0 -109
  85. llmflow/tool/terminate_tool.py +0 -23
  86. llmflow/utils/__init__.py +0 -0
  87. llmflow/utils/common_utils.py +0 -17
  88. llmflow/utils/file_handler.py +0 -25
  89. llmflow/utils/http_client.py +0 -156
  90. llmflow/utils/op_utils.py +0 -102
  91. llmflow/utils/registry.py +0 -33
  92. llmflow/vector_store/__init__.py +0 -7
  93. llmflow/vector_store/base_vector_store.py +0 -136
  94. {llmflow → flowllm/config}/__init__.py +0 -0
  95. {llmflow/config → flowllm/context}/__init__.py +0 -0
  96. {llmflow → flowllm}/enumeration/__init__.py +0 -0
  97. {llmflow → flowllm}/enumeration/chunk_enum.py +0 -0
  98. {llmflow → flowllm}/enumeration/http_enum.py +0 -0
  99. {llmflow → flowllm}/enumeration/role.py +0 -0
  100. {llmflow/op/react → flowllm/op/akshare}/__init__.py +0 -0
  101. {llmflow/pipeline → flowllm/schema}/__init__.py +0 -0
  102. {llmflow → flowllm}/schema/vector_node.py +0 -0
  103. {llmflow/schema → flowllm/utils}/__init__.py +0 -0
  104. {llmflow → flowllm}/utils/singleton.py +0 -0
  105. {flowllm-0.1.0.dist-info → flowllm-0.1.1.dist-info}/WHEEL +0 -0
@@ -1,18 +1,20 @@
+ import fcntl
+ import json
  import math
  from pathlib import Path
  from typing import List, Iterable

  from loguru import logger
  from pydantic import Field, model_validator
+ from tqdm import tqdm

- from llmflow.embedding_model.openai_compatible_embedding_model import OpenAICompatibleEmbeddingModel
- from llmflow.schema.vector_node import VectorNode
- from llmflow.vector_store import VECTOR_STORE_REGISTRY
- from llmflow.vector_store.base_vector_store import BaseVectorStore
+ from flowllm.context.service_context import C
+ from flowllm.schema.vector_node import VectorNode
+ from flowllm.storage.vector_store.base_vector_store import BaseVectorStore


- @VECTOR_STORE_REGISTRY.register("local_file")
- class FileVectorStore(BaseVectorStore):
+ @C.register_vector_store("local")
+ class LocalVectorStore(BaseVectorStore):
      store_dir: str = Field(default="./file_vector_store")

      @model_validator(mode="after")
@@ -21,6 +23,55 @@ class FileVectorStore(BaseVectorStore):
          store_path.mkdir(parents=True, exist_ok=True)
          return self

+     @staticmethod
+     def _load_from_path(workspace_id: str, path: str | Path, callback_fn=None, **kwargs) -> Iterable[VectorNode]:
+         workspace_path = Path(path) / f"{workspace_id}.jsonl"
+         if not workspace_path.exists():
+             logger.warning(f"workspace_path={workspace_path} is not exists!")
+             return
+
+         with workspace_path.open() as f:
+             fcntl.flock(f, fcntl.LOCK_SH)
+             try:
+                 for line in tqdm(f, desc="load from path"):
+                     if line.strip():
+                         node_dict = json.loads(line.strip())
+                         if callback_fn:
+                             node = callback_fn(node_dict)
+                         else:
+                             node = VectorNode(**node_dict, **kwargs)
+                         node.workspace_id = workspace_id
+                         yield node
+
+             finally:
+                 fcntl.flock(f, fcntl.LOCK_UN)
+
+     @staticmethod
+     def _dump_to_path(nodes: Iterable[VectorNode], workspace_id: str, path: str | Path = "", callback_fn=None,
+                       ensure_ascii: bool = False, **kwargs):
+         dump_path: Path = Path(path)
+         dump_path.mkdir(parents=True, exist_ok=True)
+         dump_file = dump_path / f"{workspace_id}.jsonl"
+
+         count = 0
+         with dump_file.open("w") as f:
+             fcntl.flock(f, fcntl.LOCK_EX)
+             try:
+                 for node in tqdm(nodes, desc="dump to path"):
+                     node.workspace_id = workspace_id
+                     if callback_fn:
+                         node_dict = callback_fn(node)
+                     else:
+                         node_dict = node.model_dump()
+                     assert isinstance(node_dict, dict)
+                     f.write(json.dumps(node_dict, ensure_ascii=ensure_ascii, **kwargs))
+                     f.write("\n")
+                     count += 1
+
+                 return {"size": count}
+             finally:
+                 fcntl.flock(f, fcntl.LOCK_UN)
+
      @property
      def store_path(self) -> Path:
          return Path(self.store_dir)
@@ -41,6 +92,54 @@ class FileVectorStore(BaseVectorStore):
          for i, node in enumerate(self._load_from_path(path=self.store_path, workspace_id=workspace_id, **kwargs)):
              yield node

+     def dump_workspace(self, workspace_id: str, path: str | Path = "", callback_fn=None, **kwargs):
+         if not self.exist_workspace(workspace_id=workspace_id, **kwargs):
+             logger.warning(f"workspace_id={workspace_id} is not exist!")
+             return {}
+
+         return self._dump_to_path(nodes=self._iter_workspace_nodes(workspace_id=workspace_id, **kwargs),
+                                   workspace_id=workspace_id,
+                                   path=path,
+                                   callback_fn=callback_fn,
+                                   **kwargs)
+
+     def load_workspace(self, workspace_id: str, path: str | Path = "", nodes: List[VectorNode] = None, callback_fn=None,
+                        **kwargs):
+         if self.exist_workspace(workspace_id, **kwargs):
+             self.delete_workspace(workspace_id=workspace_id, **kwargs)
+             logger.info(f"delete workspace_id={workspace_id}")
+
+         self.create_workspace(workspace_id=workspace_id, **kwargs)
+
+         all_nodes: List[VectorNode] = []
+         if nodes:
+             all_nodes.extend(nodes)
+         for node in self._load_from_path(path=path, workspace_id=workspace_id, callback_fn=callback_fn, **kwargs):
+             all_nodes.append(node)
+         self.insert(nodes=all_nodes, workspace_id=workspace_id, **kwargs)
+         return {"size": len(all_nodes)}
+
+     def copy_workspace(self, src_workspace_id: str, dest_workspace_id: str, **kwargs):
+         if not self.exist_workspace(workspace_id=src_workspace_id, **kwargs):
+             logger.warning(f"src_workspace_id={src_workspace_id} is not exist!")
+             return {}
+
+         if not self.exist_workspace(dest_workspace_id, **kwargs):
+             self.create_workspace(workspace_id=dest_workspace_id, **kwargs)
+
+         nodes = []
+         node_size = 0
+         for node in self._iter_workspace_nodes(workspace_id=src_workspace_id, **kwargs):
+             nodes.append(node)
+             node_size += 1
+             if len(nodes) >= self.batch_size:
+                 self.insert(nodes=nodes, workspace_id=dest_workspace_id, **kwargs)
+                 nodes.clear()
+
+         if nodes:
+             self.insert(nodes=nodes, workspace_id=dest_workspace_id, **kwargs)
+         return {"size": node_size}
+
      @staticmethod
      def calculate_similarity(query_vector: List[float], node_vector: List[float]):
          assert query_vector, f"query_vector is empty!"
@@ -106,12 +205,14 @@ class FileVectorStore(BaseVectorStore):


  def main():
-     from dotenv import load_dotenv
-     load_dotenv()
+     from flowllm.utils.common_utils import load_env
+     from flowllm.embedding_model import OpenAICompatibleEmbeddingModel
+
+     load_env()

      embedding_model = OpenAICompatibleEmbeddingModel(dimensions=64, model_name="text-embedding-v4")
      workspace_id = "rag_nodes_index"
-     client = FileVectorStore(embedding_model=embedding_model)
+     client = LocalVectorStore(embedding_model=embedding_model)
      client.delete_workspace(workspace_id)
      client.create_workspace(workspace_id)

@@ -160,4 +261,3 @@ def main():

  if __name__ == "__main__":
      main()
- # launch with: python -m llmflow.storage.file_vector_store
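The rename from FileVectorStore to LocalVectorStore also adds workspace dump/load/copy on top of per-workspace JSONL files guarded by fcntl locks (shared for reads, exclusive for writes), which makes these helpers POSIX-only. A minimal round-trip sketch of the new API, assuming an embedding model configured as in main() above:

```python
# Illustrative sketch (not from the package): round-trip a workspace
# through the JSONL dump/load API added in 0.1.1.
from flowllm.embedding_model import OpenAICompatibleEmbeddingModel
from flowllm.storage.vector_store.local_vector_store import LocalVectorStore

embedding_model = OpenAICompatibleEmbeddingModel(dimensions=64, model_name="text-embedding-v4")
store = LocalVectorStore(embedding_model=embedding_model)

store.dump_workspace(workspace_id="rag_nodes_index", path="./backup")  # writes ./backup/rag_nodes_index.jsonl, returns {"size": N}
store.load_workspace(workspace_id="rag_nodes_index", path="./backup")  # drops and recreates the workspace from the dump
store.copy_workspace("rag_nodes_index", "rag_nodes_index_copy")        # batched copy, returns {"size": N}
```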
@@ -0,0 +1,64 @@
+ import os
+ import re
+ from pathlib import Path
+
+ from loguru import logger
+
+
+ def camel_to_snake(content: str) -> str:
+     """
+     BaseWorker -> base_worker
+     """
+     snake_str = re.sub(r'(?<!^)(?=[A-Z])', '_', content).lower()
+     return snake_str
+
+
+ def snake_to_camel(content: str) -> str:
+     """
+     base_worker -> BaseWorker
+     """
+     camel_str = "".join(x.capitalize() for x in content.split("_"))
+     return camel_str
+
+
+ def _load_env(path: Path):
+     with path.open() as f:
+         for line in f:
+             line = line.strip()
+             if line.startswith("#"):
+                 continue
+
+             line_split = line.strip().split("=", 1)
+             if len(line_split) >= 2:
+                 key = line_split[0].strip()
+                 value = line_split[1].strip()
+                 os.environ[key] = value
+
+
+ def load_env(path: str | Path = None):
+     if path is not None:
+         path = Path(path)
+         if path.exists():
+             _load_env(path)
+     else:
+         path1 = Path(".env")
+         path2 = Path("../.env")
+         path3 = Path("../../.env")
+         path4 = Path("../../../.env")
+         path5 = Path("../../../.env")
+
+         if path1.exists():
+             path = path1
+         elif path2.exists():
+             path = path2
+         elif path3.exists():
+             path = path3
+         elif path4.exists():
+             path = path4
+         elif path5.exists():
+             path = path5
+         else:
+             raise FileNotFoundError(".env not found")
+
+         logger.info(f"using path={path}")
+         _load_env(path)
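load_env replaces the python-dotenv call that main() previously used: it searches .env, ../.env, ../../.env, and ../../../.env before giving up (note that path4 and path5 both point at ../../../.env, so the fifth candidate is a duplicate). A hedged usage sketch:

```python
# Illustrative sketch (not from the package).
from flowllm.utils.common_utils import load_env, camel_to_snake, snake_to_camel

load_env()                  # walks ./.env up through ../../../.env, else raises FileNotFoundError
load_env("conf/prod.env")   # or pass an explicit path (hypothetical file name)

assert camel_to_snake("BaseWorker") == "base_worker"
assert snake_to_camel("base_worker") == "BaseWorker"
```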
@@ -0,0 +1,331 @@
+ """
+ DataFrame cache utility that supports local CSV file storage and reading with data expiration functionality
+ """
+
+ import json
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from typing import Optional, Dict, Any
+
+ import pandas as pd
+
+
+ class DataFrameCache:
+     """
+     DataFrame cache utility class
+
+     Features:
+     - Support for pandas DataFrame local CSV storage and reading
+     - Support for data expiration time settings
+     - Automatic cleanup of expired data
+     - Recording and managing update timestamps
+     """
+
+     def __init__(self, cache_dir: str = "cache_df"):
+         self.cache_dir = Path(cache_dir)
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         self.metadata_file = self.cache_dir / "metadata.json"
+         self.metadata = {}
+         self._load_metadata()
+
+     def _load_metadata(self):
+         """Load metadata"""
+         if self.metadata_file.exists():
+             with open(self.metadata_file) as f:
+                 self.metadata = json.load(f)
+
+     def _save_metadata(self):
+         """Save metadata"""
+         with open(self.metadata_file, "w") as f:
+             json.dump(self.metadata, f, ensure_ascii=False, indent=2)
+
+     def _get_file_path(self, key: str) -> Path:
+         """Get data file path"""
+         return self.cache_dir / f"{key}.csv"
+
+     def _is_expired(self, key: str) -> bool:
+         """Check if data is expired"""
+         if key not in self.metadata:
+             return True
+
+         expire_time_str = self.metadata[key].get('expire_time')
+         if not expire_time_str:
+             return False  # No expiration time set, never expires
+
+         expire_time = datetime.fromisoformat(expire_time_str)
+         return datetime.now() > expire_time
+
+     def save(self, key: str, df: pd.DataFrame, expire_hours: Optional[float] = None,
+              **csv_kwargs) -> bool:
+         """
+         Save DataFrame to cache
+
+         Args:
+             key: Cache key name
+             df: DataFrame to save
+             expire_hours: Expiration time in hours, None means never expires
+             **csv_kwargs: Additional parameters passed to pandas to_csv
+
+         Returns:
+             bool: Whether save was successful
+         """
+         try:
+             file_path = self._get_file_path(key)
+
+             # Set default CSV parameters
+             csv_params = {
+                 "index": False,
+                 "encoding": "utf-8"
+             }
+             csv_params.update(csv_kwargs)
+
+             # Save CSV file
+             df.to_csv(file_path, **csv_params)
+
+             # Update metadata
+             current_time = datetime.now()
+             self.metadata[key] = {
+                 'created_time': current_time.isoformat(),
+                 'updated_time': current_time.isoformat(),
+                 'expire_time': (current_time + timedelta(hours=expire_hours)).isoformat() if expire_hours else None,
+                 'file_size': file_path.stat().st_size,
+                 'row_count': len(df),
+                 'column_count': len(df.columns)
+             }
+
+             self._save_metadata()
+             return True
+
+         except Exception as e:
+             print(f"Failed to save DataFrame: {e}")
+             return False
+
+     def load(self, key: str, auto_clean_expired: bool = True, **csv_kwargs) -> Optional[pd.DataFrame]:
+         """
+         Load DataFrame from cache
+
+         Args:
+             key: Cache key name
+             auto_clean_expired: Whether to automatically clean expired data
+             **csv_kwargs: Additional parameters passed to pandas read_csv
+
+         Returns:
+             Optional[pd.DataFrame]: Loaded DataFrame, returns None if not exists or expired
+         """
+         try:
+             # Check if expired
+             if self._is_expired(key):
+                 if auto_clean_expired:
+                     self.delete(key)
+                     print(f"Cache '{key}' has expired and was automatically cleaned")
+                 return None
+
+             file_path = self._get_file_path(key)
+             if not file_path.exists():
+                 return None
+
+             # Set default CSV parameters
+             csv_params = {
+                 'encoding': 'utf-8'
+             }
+             csv_params.update(csv_kwargs)
+
+             # Read CSV file
+             df = pd.read_csv(file_path, **csv_params)
+
+             # Update last access time
+             if key in self.metadata:
+                 self.metadata[key]['last_accessed'] = datetime.now().isoformat()
+                 self._save_metadata()
+
+             return df
+
+         except Exception as e:
+             print(f"Failed to load DataFrame: {e}")
+             return None
+
+     def exists(self, key: str, check_expired: bool = True) -> bool:
+         """
+         Check if cache exists
+
+         Args:
+             key: Cache key name
+             check_expired: Whether to check expiration status
+
+         Returns:
+             bool: Whether cache exists and is not expired
+         """
+         if check_expired and self._is_expired(key):
+             return False
+
+         file_path = self._get_file_path(key)
+         return file_path.exists() and key in self.metadata
+
+     def delete(self, key: str) -> bool:
+         """
+         Delete cache
+
+         Args:
+             key: Cache key name
+
+         Returns:
+             bool: Whether deletion was successful
+         """
+         try:
+             file_path = self._get_file_path(key)
+
+             # Delete CSV file
+             if file_path.exists():
+                 file_path.unlink()
+
+             # Delete metadata
+             if key in self.metadata:
+                 del self.metadata[key]
+                 self._save_metadata()
+
+             return True
+
+         except Exception as e:
+             print(f"Failed to delete cache: {e}")
+             return False
+
+     def clean_expired(self) -> int:
+         """
+         Clean all expired caches
+
+         Returns:
+             int: Number of cleaned caches
+         """
+         expired_keys = []
+
+         for key in list(self.metadata.keys()):
+             if self._is_expired(key):
+                 expired_keys.append(key)
+
+         cleaned_count = 0
+         for key in expired_keys:
+             if self.delete(key):
+                 cleaned_count += 1
+
+         return cleaned_count
+
+     def get_info(self, key: str) -> Optional[Dict[str, Any]]:
+         """
+         Get cache information
+
+         Args:
+             key: Cache key name
+
+         Returns:
+             Optional[Dict]: Cache information including creation time, update time, expiration time, etc.
+         """
+         if key not in self.metadata:
+             return None
+
+         info = self.metadata[key].copy()
+         info['key'] = key
+         info['is_expired'] = self._is_expired(key)
+         info['file_path'] = str(self._get_file_path(key))
+
+         return info
+
+     def list_all(self, include_expired: bool = False) -> Dict[str, Dict[str, Any]]:
+         """
+         List all caches
+
+         Args:
+             include_expired: Whether to include expired caches
+
+         Returns:
+             Dict: Information of all caches
+         """
+         result = {}
+
+         for key in self.metadata:
+             if not include_expired and self._is_expired(key):
+                 continue
+
+             info = self.get_info(key)
+             if info:
+                 result[key] = info
+
+         return result
+
+     def get_cache_stats(self) -> Dict[str, Any]:
+         """
+         Get cache statistics
+
+         Returns:
+             Dict: Cache statistics information
+         """
+         total_count = len(self.metadata)
+         expired_count = sum(1 for key in self.metadata if self._is_expired(key))
+         active_count = total_count - expired_count
+
+         total_size = 0
+         for key in self.metadata:
+             file_path = self._get_file_path(key)
+             if file_path.exists():
+                 total_size += file_path.stat().st_size
+
+         return {
+             'total_count': total_count,
+             'active_count': active_count,
+             'expired_count': expired_count,
+             'total_size_bytes': total_size,
+             'total_size_mb': round(total_size / (1024 * 1024), 2),
+             'cache_dir': str(self.cache_dir)
+         }
+
+     def clear_all(self) -> bool:
+         """
+         Clear all caches
+
+         Returns:
+             bool: Whether clearing was successful
+         """
+         try:
+             # Delete all CSV files
+             for csv_file in self.cache_dir.glob("*.csv"):
+                 csv_file.unlink()
+
+             # Clear metadata
+             self.metadata = {}
+             self._save_metadata()
+
+             return True
+
+         except Exception as e:
+             print(f"Failed to clear cache: {e}")
+             return False
+
+
+ # Create default instance
+ default_cache = DataFrameCache()
+
+
+ # Convenience functions
+ def save_dataframe(key: str, df: pd.DataFrame, expire_hours: Optional[float] = None,
+                    **csv_kwargs) -> bool:
+     """Convenience function: Save DataFrame"""
+     return default_cache.save(key, df, expire_hours, **csv_kwargs)
+
+
+ def load_dataframe(key: str, **csv_kwargs) -> Optional[pd.DataFrame]:
+     """Convenience function: Load DataFrame"""
+     return default_cache.load(key, **csv_kwargs)
+
+
+ def dataframe_exists(key: str) -> bool:
+     """Convenience function: Check if DataFrame exists"""
+     return default_cache.exists(key)
+
+
+ def delete_dataframe(key: str) -> bool:
+     """Convenience function: Delete DataFrame cache"""
+     return default_cache.delete(key)
+
+
+ def clean_expired_dataframes() -> int:
+     """Convenience function: Clean expired DataFrame caches"""
+     return default_cache.clean_expired()
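DataFrameCache persists each frame as <key>.csv next to a shared metadata.json that records timestamps, row/column counts, and an optional expiry. A hedged sketch of the happy path, using sample data:

```python
# Illustrative sketch (not from the package).
import pandas as pd
from flowllm.utils.dataframe_cache import DataFrameCache

cache = DataFrameCache(cache_dir="cache_df")
df = pd.DataFrame({"code": ["600519", "000001"], "price": [1800.0, 11.5]})  # hypothetical data

cache.save("a_codes", df, expire_hours=24)  # expire_hours=None would never expire
hit = cache.load("a_codes")                 # None if missing or expired (expired entries auto-deleted)
print(cache.get_cache_stats())              # total/active/expired counts plus size in bytes and MB
cache.clean_expired()                       # returns the number of entries removed
```

Note the module-level `default_cache = DataFrameCache()`, which means importing the module creates a `cache_df/` directory in the current working directory as a side effect.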
@@ -0,0 +1,113 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import urllib3
+ import random
+ import time
+
+ # Disable SSL warnings (optional, for handling insecure HTTPS)
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+ def get_random_headers():
+     """Generate random headers to avoid detection"""
+     user_agents = [
+         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
+         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
+         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
+         'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+         'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
+         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0',
+         'Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
+         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59'
+     ]
+
+     accept_languages = [
+         'en-US,en;q=0.9',
+         'zh-CN,zh;q=0.9,en;q=0.8',
+         'en-GB,en;q=0.9',
+         'fr-FR,fr;q=0.9,en;q=0.8',
+         'de-DE,de;q=0.9,en;q=0.8'
+     ]
+
+     accept_encodings = [
+         'gzip, deflate, br',
+         'gzip, deflate',
+         'br, gzip, deflate'
+     ]
+
+     headers = {
+         'User-Agent': random.choice(user_agents),
+         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+         'Accept-Language': random.choice(accept_languages),
+         'Accept-Encoding': random.choice(accept_encodings),
+         'Connection': 'keep-alive',
+         'Upgrade-Insecure-Requests': '1',
+     }
+
+     # Randomly add some optional headers
+     if random.random() > 0.5:
+         headers['DNT'] = '1'
+     if random.random() > 0.7:
+         headers['Cache-Control'] = 'max-age=0'
+     if random.random() > 0.6:
+         headers['Sec-Fetch-Dest'] = 'document'
+         headers['Sec-Fetch-Mode'] = 'navigate'
+         headers['Sec-Fetch-Site'] = 'none'
+
+     return headers
+
+ def fetch_webpage_text(url, min_delay=1, max_delay=3):
+     """
+     Fetch and extract text content from a webpage with randomization
+
+     Args:
+         url (str): The URL to fetch
+         min_delay (int): Minimum delay before request (seconds)
+         max_delay (int): Maximum delay before request (seconds)
+
+     Returns:
+         str: Extracted text content or error message
+     """
+     # Add random delay to avoid being detected as bot
+     delay = random.uniform(min_delay, max_delay)
+     time.sleep(delay)
+
+     headers = get_random_headers()
+
+     # Random timeout between 8-15 seconds
+     timeout = random.randint(8, 15)
+
+     try:
+         # Send request with random headers and timeout
+         response = requests.get(url, headers=headers, timeout=timeout, verify=False)
+         response.raise_for_status()  # Check if request was successful
+         response.encoding = response.apparent_encoding  # Auto-detect encoding
+
+         # Parse HTML using BeautifulSoup
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         # Remove script, style and navigation elements to avoid interference
+         for script_or_style in soup(['script', 'style', 'nav', 'footer', 'header']):
+             script_or_style.decompose()
+
+         # Extract text content
+         text = soup.get_text()
+
+         # Clean whitespace: remove extra blank lines and spaces
+         lines = (line.strip() for line in text.splitlines())
+         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+         text = ' '.join(chunk for chunk in chunks if chunk)
+
+         return text
+
+     except requests.exceptions.RequestException as e:
+         return f"Request failed: {e}"
+     except Exception as e:
+         return f"Parsing failed: {e}"
+
+ # Example usage
+ if __name__ == "__main__":
+     url = "http://finance.eastmoney.com/a/202508133482756869.html"
+     text = fetch_webpage_text(url)
+     print(text)
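Note that fetch_webpage_text reports failures by returning the error as a string rather than raising, so callers cannot treat every return value as page text. A hedged wrapper sketch that restores a None-on-failure contract:

```python
# Illustrative sketch (not from the package).
from flowllm.utils.fetch_url import fetch_webpage_text

def fetch_or_none(url: str):
    text = fetch_webpage_text(url)
    # These prefixes match the error strings returned by fetch_webpage_text.
    if text.startswith(("Request failed:", "Parsing failed:")):
        return None
    return text
```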
@@ -1,9 +1,10 @@
  import time
+ from typing import Optional

  from loguru import logger


- class Timer(object):
+ class Timer:
      def __init__(self, name: str, use_ms: bool = False, stack_level: int = 2):
          self.name: str = name
          self.use_ms: bool = use_ms
@@ -15,7 +16,7 @@ class Timer(object):

      def __enter__(self, *args, **kwargs):
          self.time_start = time.time()
-         logger.info(f"---------- enter {self.name} ----------", stacklevel=self.stack_level)
+         logger.info(f"========== timer.{self.name} start ==========", stacklevel=self.stack_level)
          return self

      def __exit__(self, *args):
@@ -26,10 +27,10 @@ class Timer(object):
          else:
              time_str = f"{self.time_cost:.3f}s"

-         logger.info(f"---------- leave {self.name} [{time_str}] ----------", stacklevel=self.stack_level)
+         logger.info(f"========== timer.{self.name} end, time_cost={time_str} ==========", stacklevel=self.stack_level)


- def timer(name: str = None, use_ms: bool = False, stack_level: int = 2):
+ def timer(name: Optional[str] = None, use_ms: bool = False, stack_level: int = 2):
      def decorator(func):
          def wrapper(*args, **kwargs):
              with Timer(name=name or func.__name__, use_ms=use_ms, stack_level=stack_level + 1):
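The 0.1.1 change here only reworks the log format (the `---------- enter/leave ----------` markers become `========== timer.<name> start/end ==========`) and tightens the type hints; both entry points keep their signatures. A hedged usage sketch (the decorator's return path is truncated above, so the `@timer()` form is inferred from the signature):

```python
# Illustrative sketch (not from the package).
from flowllm.utils.timer import Timer, timer

with Timer("embed_batch", use_ms=True):
    pass  # logs "========== timer.embed_batch start ==========" then "... end, time_cost=..."

@timer()  # name defaults to the wrapped function's __name__
def build_index():
    pass
```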