flowllm 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- flowllm/__init__.py +12 -0
- flowllm/app.py +25 -0
- flowllm/config/default_config.yaml +82 -0
- flowllm/config/pydantic_config_parser.py +242 -0
- flowllm/context/base_context.py +59 -0
- flowllm/context/flow_context.py +28 -0
- llmflow/op/prompt_mixin.py → flowllm/context/prompt_handler.py +25 -14
- flowllm/context/registry.py +26 -0
- flowllm/context/service_context.py +103 -0
- flowllm/embedding_model/__init__.py +1 -0
- {llmflow → flowllm}/embedding_model/base_embedding_model.py +2 -2
- {llmflow → flowllm}/embedding_model/openai_compatible_embedding_model.py +8 -8
- flowllm/flow_engine/__init__.py +1 -0
- flowllm/flow_engine/base_flow_engine.py +34 -0
- flowllm/flow_engine/simple_flow_engine.py +213 -0
- flowllm/llm/__init__.py +1 -0
- {llmflow → flowllm}/llm/base_llm.py +16 -24
- {llmflow → flowllm}/llm/openai_compatible_llm.py +64 -108
- flowllm/op/__init__.py +3 -0
- flowllm/op/akshare/get_ak_a_code_op.py +116 -0
- flowllm/op/akshare/get_ak_a_code_prompt.yaml +21 -0
- flowllm/op/akshare/get_ak_a_info_op.py +143 -0
- flowllm/op/base_op.py +169 -0
- flowllm/op/llm_base_op.py +63 -0
- flowllm/op/mock_op.py +42 -0
- flowllm/op/parallel_op.py +30 -0
- flowllm/op/sequential_op.py +29 -0
- flowllm/schema/flow_response.py +12 -0
- flowllm/schema/message.py +35 -0
- flowllm/schema/service_config.py +76 -0
- flowllm/schema/tool_call.py +110 -0
- flowllm/service/__init__.py +2 -0
- flowllm/service/base_service.py +59 -0
- flowllm/service/http_service.py +87 -0
- flowllm/service/mcp_service.py +45 -0
- flowllm/storage/__init__.py +1 -0
- flowllm/storage/vector_store/__init__.py +3 -0
- flowllm/storage/vector_store/base_vector_store.py +44 -0
- {llmflow → flowllm/storage}/vector_store/chroma_vector_store.py +11 -10
- {llmflow → flowllm/storage}/vector_store/es_vector_store.py +10 -9
- llmflow/vector_store/file_vector_store.py → flowllm/storage/vector_store/local_vector_store.py +110 -10
- flowllm/utils/common_utils.py +64 -0
- flowllm/utils/dataframe_cache.py +331 -0
- flowllm/utils/fetch_url.py +113 -0
- {llmflow → flowllm}/utils/timer.py +5 -4
- {flowllm-0.1.0.dist-info → flowllm-0.1.1.dist-info}/METADATA +31 -27
- flowllm-0.1.1.dist-info/RECORD +62 -0
- flowllm-0.1.1.dist-info/entry_points.txt +4 -0
- {flowllm-0.1.0.dist-info → flowllm-0.1.1.dist-info}/licenses/LICENSE +1 -1
- flowllm-0.1.1.dist-info/top_level.txt +1 -0
- flowllm-0.1.0.dist-info/RECORD +0 -66
- flowllm-0.1.0.dist-info/entry_points.txt +0 -3
- flowllm-0.1.0.dist-info/top_level.txt +0 -1
- llmflow/app.py +0 -53
- llmflow/config/config_parser.py +0 -80
- llmflow/config/mock_config.yaml +0 -58
- llmflow/embedding_model/__init__.py +0 -5
- llmflow/enumeration/agent_state.py +0 -8
- llmflow/llm/__init__.py +0 -5
- llmflow/mcp_server.py +0 -110
- llmflow/op/__init__.py +0 -10
- llmflow/op/base_op.py +0 -125
- llmflow/op/mock_op.py +0 -40
- llmflow/op/react/react_v1_op.py +0 -88
- llmflow/op/react/react_v1_prompt.yaml +0 -28
- llmflow/op/vector_store/__init__.py +0 -13
- llmflow/op/vector_store/recall_vector_store_op.py +0 -48
- llmflow/op/vector_store/update_vector_store_op.py +0 -28
- llmflow/op/vector_store/vector_store_action_op.py +0 -46
- llmflow/pipeline/pipeline.py +0 -94
- llmflow/pipeline/pipeline_context.py +0 -37
- llmflow/schema/app_config.py +0 -69
- llmflow/schema/experience.py +0 -144
- llmflow/schema/message.py +0 -68
- llmflow/schema/request.py +0 -32
- llmflow/schema/response.py +0 -29
- llmflow/service/__init__.py +0 -0
- llmflow/service/llmflow_service.py +0 -96
- llmflow/tool/__init__.py +0 -9
- llmflow/tool/base_tool.py +0 -80
- llmflow/tool/code_tool.py +0 -43
- llmflow/tool/dashscope_search_tool.py +0 -162
- llmflow/tool/mcp_tool.py +0 -77
- llmflow/tool/tavily_search_tool.py +0 -109
- llmflow/tool/terminate_tool.py +0 -23
- llmflow/utils/__init__.py +0 -0
- llmflow/utils/common_utils.py +0 -17
- llmflow/utils/file_handler.py +0 -25
- llmflow/utils/http_client.py +0 -156
- llmflow/utils/op_utils.py +0 -102
- llmflow/utils/registry.py +0 -33
- llmflow/vector_store/__init__.py +0 -7
- llmflow/vector_store/base_vector_store.py +0 -136
- {llmflow → flowllm/config}/__init__.py +0 -0
- {llmflow/config → flowllm/context}/__init__.py +0 -0
- {llmflow → flowllm}/enumeration/__init__.py +0 -0
- {llmflow → flowllm}/enumeration/chunk_enum.py +0 -0
- {llmflow → flowllm}/enumeration/http_enum.py +0 -0
- {llmflow → flowllm}/enumeration/role.py +0 -0
- {llmflow/op/react → flowllm/op/akshare}/__init__.py +0 -0
- {llmflow/pipeline → flowllm/schema}/__init__.py +0 -0
- {llmflow → flowllm}/schema/vector_node.py +0 -0
- {llmflow/schema → flowllm/utils}/__init__.py +0 -0
- {llmflow → flowllm}/utils/singleton.py +0 -0
- {flowllm-0.1.0.dist-info → flowllm-0.1.1.dist-info}/WHEEL +0 -0
llmflow/vector_store/file_vector_store.py → flowllm/storage/vector_store/local_vector_store.py
RENAMED

```diff
@@ -1,18 +1,20 @@
+import fcntl
+import json
 import math
 from pathlib import Path
 from typing import List, Iterable
 
 from loguru import logger
 from pydantic import Field, model_validator
+from tqdm import tqdm
 
-from
-from
-from
-from llmflow.vector_store.base_vector_store import BaseVectorStore
+from flowllm.context.service_context import C
+from flowllm.schema.vector_node import VectorNode
+from flowllm.storage.vector_store.base_vector_store import BaseVectorStore
 
 
-@
-class
+@C.register_vector_store("local")
+class LocalVectorStore(BaseVectorStore):
     store_dir: str = Field(default="./file_vector_store")
 
     @model_validator(mode="after")
@@ -21,6 +23,55 @@ class FileVectorStore(BaseVectorStore):
         store_path.mkdir(parents=True, exist_ok=True)
         return self
 
+    @staticmethod
+    def _load_from_path(workspace_id: str, path: str | Path, callback_fn=None, **kwargs) -> Iterable[VectorNode]:
+        workspace_path = Path(path) / f"{workspace_id}.jsonl"
+        if not workspace_path.exists():
+            logger.warning(f"workspace_path={workspace_path} is not exists!")
+            return
+
+        with workspace_path.open() as f:
+            fcntl.flock(f, fcntl.LOCK_SH)
+            try:
+                for line in tqdm(f, desc="load from path"):
+                    if line.strip():
+                        node_dict = json.loads(line.strip())
+                        if callback_fn:
+                            node = callback_fn(node_dict)
+                        else:
+                            node = VectorNode(**node_dict, **kwargs)
+                        node.workspace_id = workspace_id
+                        yield node
+
+            finally:
+                fcntl.flock(f, fcntl.LOCK_UN)
+
+    @staticmethod
+    def _dump_to_path(nodes: Iterable[VectorNode], workspace_id: str, path: str | Path = "", callback_fn=None,
+                      ensure_ascii: bool = False, **kwargs):
+        dump_path: Path = Path(path)
+        dump_path.mkdir(parents=True, exist_ok=True)
+        dump_file = dump_path / f"{workspace_id}.jsonl"
+
+        count = 0
+        with dump_file.open("w") as f:
+            fcntl.flock(f, fcntl.LOCK_EX)
+            try:
+                for node in tqdm(nodes, desc="dump to path"):
+                    node.workspace_id = workspace_id
+                    if callback_fn:
+                        node_dict = callback_fn(node)
+                    else:
+                        node_dict = node.model_dump()
+                    assert isinstance(node_dict, dict)
+                    f.write(json.dumps(node_dict, ensure_ascii=ensure_ascii, **kwargs))
+                    f.write("\n")
+                    count += 1
+
+                return {"size": count}
+            finally:
+                fcntl.flock(f, fcntl.LOCK_UN)
+
     @property
     def store_path(self) -> Path:
         return Path(self.store_dir)
@@ -41,6 +92,54 @@ class FileVectorStore(BaseVectorStore):
         for i, node in enumerate(self._load_from_path(path=self.store_path, workspace_id=workspace_id, **kwargs)):
             yield node
 
+    def dump_workspace(self, workspace_id: str, path: str | Path = "", callback_fn=None, **kwargs):
+        if not self.exist_workspace(workspace_id=workspace_id, **kwargs):
+            logger.warning(f"workspace_id={workspace_id} is not exist!")
+            return {}
+
+        return self._dump_to_path(nodes=self._iter_workspace_nodes(workspace_id=workspace_id, **kwargs),
+                                  workspace_id=workspace_id,
+                                  path=path,
+                                  callback_fn=callback_fn,
+                                  **kwargs)
+
+    def load_workspace(self, workspace_id: str, path: str | Path = "", nodes: List[VectorNode] = None, callback_fn=None,
+                       **kwargs):
+        if self.exist_workspace(workspace_id, **kwargs):
+            self.delete_workspace(workspace_id=workspace_id, **kwargs)
+            logger.info(f"delete workspace_id={workspace_id}")
+
+        self.create_workspace(workspace_id=workspace_id, **kwargs)
+
+        all_nodes: List[VectorNode] = []
+        if nodes:
+            all_nodes.extend(nodes)
+        for node in self._load_from_path(path=path, workspace_id=workspace_id, callback_fn=callback_fn, **kwargs):
+            all_nodes.append(node)
+        self.insert(nodes=all_nodes, workspace_id=workspace_id, **kwargs)
+        return {"size": len(all_nodes)}
+
+    def copy_workspace(self, src_workspace_id: str, dest_workspace_id: str, **kwargs):
+        if not self.exist_workspace(workspace_id=src_workspace_id, **kwargs):
+            logger.warning(f"src_workspace_id={src_workspace_id} is not exist!")
+            return {}
+
+        if not self.exist_workspace(dest_workspace_id, **kwargs):
+            self.create_workspace(workspace_id=dest_workspace_id, **kwargs)
+
+        nodes = []
+        node_size = 0
+        for node in self._iter_workspace_nodes(workspace_id=src_workspace_id, **kwargs):
+            nodes.append(node)
+            node_size += 1
+            if len(nodes) >= self.batch_size:
+                self.insert(nodes=nodes, workspace_id=dest_workspace_id, **kwargs)
+                nodes.clear()
+
+        if nodes:
+            self.insert(nodes=nodes, workspace_id=dest_workspace_id, **kwargs)
+        return {"size": node_size}
+
     @staticmethod
     def calculate_similarity(query_vector: List[float], node_vector: List[float]):
         assert query_vector, f"query_vector is empty!"
@@ -106,12 +205,14 @@ class FileVectorStore(BaseVectorStore):
 
 
 def main():
-    from
-
+    from flowllm.utils.common_utils import load_env
+    from flowllm.embedding_model import OpenAICompatibleEmbeddingModel
+
+    load_env()
 
     embedding_model = OpenAICompatibleEmbeddingModel(dimensions=64, model_name="text-embedding-v4")
     workspace_id = "rag_nodes_index"
-    client =
+    client = LocalVectorStore(embedding_model=embedding_model)
     client.delete_workspace(workspace_id)
     client.create_workspace(workspace_id)
 
@@ -160,4 +261,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-# launch with: python -m llmflow.storage.file_vector_store
```
flowllm/utils/common_utils.py
ADDED

```diff
@@ -0,0 +1,64 @@
+import os
+import re
+from pathlib import Path
+
+from loguru import logger
+
+
+def camel_to_snake(content: str) -> str:
+    """
+    BaseWorker -> base_worker
+    """
+    snake_str = re.sub(r'(?<!^)(?=[A-Z])', '_', content).lower()
+    return snake_str
+
+
+def snake_to_camel(content: str) -> str:
+    """
+    base_worker -> BaseWorker
+    """
+    camel_str = "".join(x.capitalize() for x in content.split("_"))
+    return camel_str
+
+
+def _load_env(path: Path):
+    with path.open() as f:
+        for line in f:
+            line = line.strip()
+            if line.startswith("#"):
+                continue
+
+            line_split = line.strip().split("=", 1)
+            if len(line_split) >= 2:
+                key = line_split[0].strip()
+                value = line_split[1].strip()
+                os.environ[key] = value
+
+
+def load_env(path: str | Path = None):
+    if path is not None:
+        path = Path(path)
+        if path.exists():
+            _load_env(path)
+    else:
+        path1 = Path(".env")
+        path2 = Path("../.env")
+        path3 = Path("../../.env")
+        path4 = Path("../../../.env")
+        path5 = Path("../../../.env")
+
+        if path1.exists():
+            path = path1
+        elif path2.exists():
+            path = path2
+        elif path3.exists():
+            path = path3
+        elif path4.exists():
+            path = path4
+        elif path5.exists():
+            path = path5
+        else:
+            raise FileNotFoundError(".env not found")
+
+        logger.info(f"using path={path}")
+        _load_env(path)
```
flowllm/utils/dataframe_cache.py
ADDED

```diff
@@ -0,0 +1,331 @@
+"""
+DataFrame cache utility that supports local CSV file storage and reading with data expiration functionality
+"""
+
+import json
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Optional, Dict, Any
+
+import pandas as pd
+
+
+class DataFrameCache:
+    """
+    DataFrame cache utility class
+
+    Features:
+    - Support for pandas DataFrame local CSV storage and reading
+    - Support for data expiration time settings
+    - Automatic cleanup of expired data
+    - Recording and managing update timestamps
+    """
+
+    def __init__(self, cache_dir: str = "cache_df"):
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        self.metadata_file = self.cache_dir / "metadata.json"
+        self.metadata = {}
+        self._load_metadata()
+
+    def _load_metadata(self):
+        """Load metadata"""
+        if self.metadata_file.exists():
+            with open(self.metadata_file) as f:
+                self.metadata = json.load(f)
+
+    def _save_metadata(self):
+        """Save metadata"""
+        with open(self.metadata_file, "w") as f:
+            json.dump(self.metadata, f, ensure_ascii=False, indent=2)
+
+    def _get_file_path(self, key: str) -> Path:
+        """Get data file path"""
+        return self.cache_dir / f"{key}.csv"
+
+    def _is_expired(self, key: str) -> bool:
+        """Check if data is expired"""
+        if key not in self.metadata:
+            return True
+
+        expire_time_str = self.metadata[key].get('expire_time')
+        if not expire_time_str:
+            return False  # No expiration time set, never expires
+
+        expire_time = datetime.fromisoformat(expire_time_str)
+        return datetime.now() > expire_time
+
+    def save(self, key: str, df: pd.DataFrame, expire_hours: Optional[float] = None,
+             **csv_kwargs) -> bool:
+        """
+        Save DataFrame to cache
+
+        Args:
+            key: Cache key name
+            df: DataFrame to save
+            expire_hours: Expiration time in hours, None means never expires
+            **csv_kwargs: Additional parameters passed to pandas to_csv
+
+        Returns:
+            bool: Whether save was successful
+        """
+        try:
+            file_path = self._get_file_path(key)
+
+            # Set default CSV parameters
+            csv_params = {
+                "index": False,
+                "encoding": "utf-8"
+            }
+            csv_params.update(csv_kwargs)
+
+            # Save CSV file
+            df.to_csv(file_path, **csv_params)
+
+            # Update metadata
+            current_time = datetime.now()
+            self.metadata[key] = {
+                'created_time': current_time.isoformat(),
+                'updated_time': current_time.isoformat(),
+                'expire_time': (current_time + timedelta(hours=expire_hours)).isoformat() if expire_hours else None,
+                'file_size': file_path.stat().st_size,
+                'row_count': len(df),
+                'column_count': len(df.columns)
+            }
+
+            self._save_metadata()
+            return True
+
+        except Exception as e:
+            print(f"Failed to save DataFrame: {e}")
+            return False
+
+    def load(self, key: str, auto_clean_expired: bool = True, **csv_kwargs) -> Optional[pd.DataFrame]:
+        """
+        Load DataFrame from cache
+
+        Args:
+            key: Cache key name
+            auto_clean_expired: Whether to automatically clean expired data
+            **csv_kwargs: Additional parameters passed to pandas read_csv
+
+        Returns:
+            Optional[pd.DataFrame]: Loaded DataFrame, returns None if not exists or expired
+        """
+        try:
+            # Check if expired
+            if self._is_expired(key):
+                if auto_clean_expired:
+                    self.delete(key)
+                    print(f"Cache '{key}' has expired and was automatically cleaned")
+                return None
+
+            file_path = self._get_file_path(key)
+            if not file_path.exists():
+                return None
+
+            # Set default CSV parameters
+            csv_params = {
+                'encoding': 'utf-8'
+            }
+            csv_params.update(csv_kwargs)
+
+            # Read CSV file
+            df = pd.read_csv(file_path, **csv_params)
+
+            # Update last access time
+            if key in self.metadata:
+                self.metadata[key]['last_accessed'] = datetime.now().isoformat()
+                self._save_metadata()
+
+            return df
+
+        except Exception as e:
+            print(f"Failed to load DataFrame: {e}")
+            return None
+
+    def exists(self, key: str, check_expired: bool = True) -> bool:
+        """
+        Check if cache exists
+
+        Args:
+            key: Cache key name
+            check_expired: Whether to check expiration status
+
+        Returns:
+            bool: Whether cache exists and is not expired
+        """
+        if check_expired and self._is_expired(key):
+            return False
+
+        file_path = self._get_file_path(key)
+        return file_path.exists() and key in self.metadata
+
+    def delete(self, key: str) -> bool:
+        """
+        Delete cache
+
+        Args:
+            key: Cache key name
+
+        Returns:
+            bool: Whether deletion was successful
+        """
+        try:
+            file_path = self._get_file_path(key)
+
+            # Delete CSV file
+            if file_path.exists():
+                file_path.unlink()
+
+            # Delete metadata
+            if key in self.metadata:
+                del self.metadata[key]
+                self._save_metadata()
+
+            return True
+
+        except Exception as e:
+            print(f"Failed to delete cache: {e}")
+            return False
+
+    def clean_expired(self) -> int:
+        """
+        Clean all expired caches
+
+        Returns:
+            int: Number of cleaned caches
+        """
+        expired_keys = []
+
+        for key in list(self.metadata.keys()):
+            if self._is_expired(key):
+                expired_keys.append(key)
+
+        cleaned_count = 0
+        for key in expired_keys:
+            if self.delete(key):
+                cleaned_count += 1
+
+        return cleaned_count
+
+    def get_info(self, key: str) -> Optional[Dict[str, Any]]:
+        """
+        Get cache information
+
+        Args:
+            key: Cache key name
+
+        Returns:
+            Optional[Dict]: Cache information including creation time, update time, expiration time, etc.
+        """
+        if key not in self.metadata:
+            return None
+
+        info = self.metadata[key].copy()
+        info['key'] = key
+        info['is_expired'] = self._is_expired(key)
+        info['file_path'] = str(self._get_file_path(key))
+
+        return info
+
+    def list_all(self, include_expired: bool = False) -> Dict[str, Dict[str, Any]]:
+        """
+        List all caches
+
+        Args:
+            include_expired: Whether to include expired caches
+
+        Returns:
+            Dict: Information of all caches
+        """
+        result = {}
+
+        for key in self.metadata:
+            if not include_expired and self._is_expired(key):
+                continue
+
+            info = self.get_info(key)
+            if info:
+                result[key] = info
+
+        return result
+
+    def get_cache_stats(self) -> Dict[str, Any]:
+        """
+        Get cache statistics
+
+        Returns:
+            Dict: Cache statistics information
+        """
+        total_count = len(self.metadata)
+        expired_count = sum(1 for key in self.metadata if self._is_expired(key))
+        active_count = total_count - expired_count
+
+        total_size = 0
+        for key in self.metadata:
+            file_path = self._get_file_path(key)
+            if file_path.exists():
+                total_size += file_path.stat().st_size
+
+        return {
+            'total_count': total_count,
+            'active_count': active_count,
+            'expired_count': expired_count,
+            'total_size_bytes': total_size,
+            'total_size_mb': round(total_size / (1024 * 1024), 2),
+            'cache_dir': str(self.cache_dir)
+        }
+
+    def clear_all(self) -> bool:
+        """
+        Clear all caches
+
+        Returns:
+            bool: Whether clearing was successful
+        """
+        try:
+            # Delete all CSV files
+            for csv_file in self.cache_dir.glob("*.csv"):
+                csv_file.unlink()
+
+            # Clear metadata
+            self.metadata = {}
+            self._save_metadata()
+
+            return True
+
+        except Exception as e:
+            print(f"Failed to clear cache: {e}")
+            return False
+
+
+# Create default instance
+default_cache = DataFrameCache()
+
+
+# Convenience functions
+def save_dataframe(key: str, df: pd.DataFrame, expire_hours: Optional[float] = None,
+                   **csv_kwargs) -> bool:
+    """Convenience function: Save DataFrame"""
+    return default_cache.save(key, df, expire_hours, **csv_kwargs)
+
+
+def load_dataframe(key: str, **csv_kwargs) -> Optional[pd.DataFrame]:
+    """Convenience function: Load DataFrame"""
+    return default_cache.load(key, **csv_kwargs)
+
+
+def dataframe_exists(key: str) -> bool:
+    """Convenience function: Check if DataFrame exists"""
+    return default_cache.exists(key)
+
+
+def delete_dataframe(key: str) -> bool:
+    """Convenience function: Delete DataFrame cache"""
+    return default_cache.delete(key)
+
+
+def clean_expired_dataframes() -> int:
+    """Convenience function: Clean expired DataFrame caches"""
+    return default_cache.clean_expired()
```
flowllm/utils/fetch_url.py
ADDED

```diff
@@ -0,0 +1,113 @@
+import requests
+from bs4 import BeautifulSoup
+import urllib3
+import random
+import time
+
+# Disable SSL warnings (optional, for handling insecure HTTPS)
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+def get_random_headers():
+    """Generate random headers to avoid detection"""
+    user_agents = [
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0',
+        'Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59'
+    ]
+
+    accept_languages = [
+        'en-US,en;q=0.9',
+        'zh-CN,zh;q=0.9,en;q=0.8',
+        'en-GB,en;q=0.9',
+        'fr-FR,fr;q=0.9,en;q=0.8',
+        'de-DE,de;q=0.9,en;q=0.8'
+    ]
+
+    accept_encodings = [
+        'gzip, deflate, br',
+        'gzip, deflate',
+        'br, gzip, deflate'
+    ]
+
+    headers = {
+        'User-Agent': random.choice(user_agents),
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': random.choice(accept_languages),
+        'Accept-Encoding': random.choice(accept_encodings),
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+    }
+
+    # Randomly add some optional headers
+    if random.random() > 0.5:
+        headers['DNT'] = '1'
+    if random.random() > 0.7:
+        headers['Cache-Control'] = 'max-age=0'
+    if random.random() > 0.6:
+        headers['Sec-Fetch-Dest'] = 'document'
+        headers['Sec-Fetch-Mode'] = 'navigate'
+        headers['Sec-Fetch-Site'] = 'none'
+
+    return headers
+
+def fetch_webpage_text(url, min_delay=1, max_delay=3):
+    """
+    Fetch and extract text content from a webpage with randomization
+
+    Args:
+        url (str): The URL to fetch
+        min_delay (int): Minimum delay before request (seconds)
+        max_delay (int): Maximum delay before request (seconds)
+
+    Returns:
+        str: Extracted text content or error message
+    """
+    # Add random delay to avoid being detected as bot
+    delay = random.uniform(min_delay, max_delay)
+    time.sleep(delay)
+
+    headers = get_random_headers()
+
+    # Random timeout between 8-15 seconds
+    timeout = random.randint(8, 15)
+
+    try:
+        # Send request with random headers and timeout
+        response = requests.get(url, headers=headers, timeout=timeout, verify=False)
+        response.raise_for_status()  # Check if request was successful
+        response.encoding = response.apparent_encoding  # Auto-detect encoding
+
+        # Parse HTML using BeautifulSoup
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Remove script, style and navigation elements to avoid interference
+        for script_or_style in soup(['script', 'style', 'nav', 'footer', 'header']):
+            script_or_style.decompose()
+
+        # Extract text content
+        text = soup.get_text()
+
+        # Clean whitespace: remove extra blank lines and spaces
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+        text = ' '.join(chunk for chunk in chunks if chunk)
+
+        return text
+
+    except requests.exceptions.RequestException as e:
+        return f"Request failed: {e}"
+    except Exception as e:
+        return f"Parsing failed: {e}"
+
+# Example usage
+if __name__ == "__main__":
+    url = "http://finance.eastmoney.com/a/202508133482756869.html"
+    text = fetch_webpage_text(url)
+    print(text)
```
{llmflow → flowllm}/utils/timer.py
RENAMED

```diff
@@ -1,9 +1,10 @@
 import time
+from typing import Optional
 
 from loguru import logger
 
 
-class Timer
+class Timer:
     def __init__(self, name: str, use_ms: bool = False, stack_level: int = 2):
         self.name: str = name
         self.use_ms: bool = use_ms
@@ -15,7 +16,7 @@ class Timer(object):
 
     def __enter__(self, *args, **kwargs):
         self.time_start = time.time()
-        logger.info(f"
+        logger.info(f"========== timer.{self.name} start ==========", stacklevel=self.stack_level)
         return self
 
     def __exit__(self, *args):
@@ -26,10 +27,10 @@ class Timer(object):
         else:
             time_str = f"{self.time_cost:.3f}s"
 
-        logger.info(f"
+        logger.info(f"========== timer.{self.name} end, time_cost={time_str} ==========", stacklevel=self.stack_level)
 
 
-def timer(name: str = None, use_ms: bool = False, stack_level: int = 2):
+def timer(name: Optional[str] = None, use_ms: bool = False, stack_level: int = 2):
    def decorator(func):
        def wrapper(*args, **kwargs):
            with Timer(name=name or func.__name__, use_ms=use_ms, stack_level=stack_level + 1):
```