bioguider 0.2.23__py3-none-any.whl → 0.2.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bioguider might be problematic. Click here for more details.
- bioguider/agents/collection_task.py +8 -3
- bioguider/agents/consistency_query_step.py +3 -0
- bioguider/agents/evaluation_tutorial_task.py +45 -12
- bioguider/agents/evaluation_userguide_task.py +40 -11
- bioguider/utils/file_utils.py +81 -1
- bioguider/utils/utils.py +21 -1
- {bioguider-0.2.23.dist-info → bioguider-0.2.25.dist-info}/METADATA +3 -1
- {bioguider-0.2.23.dist-info → bioguider-0.2.25.dist-info}/RECORD +10 -10
- {bioguider-0.2.23.dist-info → bioguider-0.2.25.dist-info}/LICENSE +0 -0
- {bioguider-0.2.23.dist-info → bioguider-0.2.25.dist-info}/WHEEL +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
|
|
2
2
|
import os
|
|
3
3
|
import logging
|
|
4
|
+
from pathlib import Path
|
|
4
5
|
import re
|
|
5
6
|
import json
|
|
6
7
|
from pydantic import BaseModel, Field
|
|
@@ -23,7 +24,7 @@ from langchain.schema import (
|
|
|
23
24
|
from langgraph.graph import StateGraph, START, END
|
|
24
25
|
|
|
25
26
|
from bioguider.database.summarized_file_db import SummarizedFilesDb
|
|
26
|
-
from bioguider.utils.file_utils import get_file_type
|
|
27
|
+
from bioguider.utils.file_utils import flatten_files, get_file_type
|
|
27
28
|
from bioguider.agents.agent_utils import read_directory, try_parse_json_object
|
|
28
29
|
from bioguider.agents.collection_task_utils import (
|
|
29
30
|
RELATED_FILE_GOAL_ITEM,
|
|
@@ -187,11 +188,15 @@ class CollectionTask(AgentTask):
|
|
|
187
188
|
logger.error(f"Final answer is not a valid JSON: {result}")
|
|
188
189
|
return None
|
|
189
190
|
final_result = the_obj["final_answer"]
|
|
191
|
+
files = None
|
|
190
192
|
if isinstance(final_result, str):
|
|
191
193
|
final_result = final_result.strip()
|
|
192
|
-
|
|
194
|
+
files = [final_result]
|
|
193
195
|
elif isinstance(final_result, list):
|
|
194
|
-
|
|
196
|
+
files = final_result
|
|
195
197
|
else:
|
|
196
198
|
logger.error(f"Final answer is not a valid JSON list or string: {result}")
|
|
197
199
|
return None
|
|
200
|
+
|
|
201
|
+
files = flatten_files(self.repo_path, files)
|
|
202
|
+
return files
|
|
@@ -45,14 +45,17 @@ class ConsistencyQueryStep(CommonStep):
|
|
|
45
45
|
else:
|
|
46
46
|
if file_path is not None and parent is not None:
|
|
47
47
|
rows = self.code_structure_db.select_by_name_and_parent_and_path(name, parent, file_path)
|
|
48
|
+
rows = rows if rows is None else [rows]
|
|
48
49
|
if rows is None or len(rows) == 0:
|
|
49
50
|
rows = self.code_structure_db.select_by_name_and_path(name, file_path)
|
|
51
|
+
rows = rows if rows is None else [rows]
|
|
50
52
|
if rows is None or len(rows) == 0:
|
|
51
53
|
rows = self.code_structure_db.select_by_name_and_parent(name, parent)
|
|
52
54
|
if rows is None or len(rows) == 0:
|
|
53
55
|
rows = self.code_structure_db.select_by_name(name)
|
|
54
56
|
elif file_path is not None:
|
|
55
57
|
rows = self.code_structure_db.select_by_name_and_path(name, file_path)
|
|
58
|
+
rows = rows if rows is None else [rows]
|
|
56
59
|
if rows is None or len(rows) == 0:
|
|
57
60
|
rows = self.code_structure_db.select_by_name(name)
|
|
58
61
|
elif parent is not None:
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Callable
|
|
5
|
+
from typing import Callable, Tuple
|
|
6
6
|
from langchain.prompts import ChatPromptTemplate
|
|
7
7
|
from langchain_openai.chat_models.base import BaseChatOpenAI
|
|
8
8
|
from pydantic import BaseModel, Field
|
|
@@ -16,12 +16,15 @@ from bioguider.agents.collection_task import CollectionTask
|
|
|
16
16
|
from bioguider.agents.evaluation_tutorial_task_prompts import INDIVIDUAL_TUTORIAL_EVALUATION_SYSTEM_PROMPT
|
|
17
17
|
from bioguider.agents.prompt_utils import CollectionGoalItemEnum
|
|
18
18
|
from bioguider.utils.constants import DEFAULT_TOKEN_USAGE, ProjectMetadata
|
|
19
|
+
from bioguider.utils.file_utils import detect_file_type, flatten_files
|
|
19
20
|
from bioguider.utils.notebook_utils import extract_markdown_from_notebook, strip_notebook_to_code_and_markdown
|
|
20
21
|
from bioguider.utils.pyphen_utils import PyphenReadability
|
|
21
|
-
from bioguider.utils.utils import increase_token_usage
|
|
22
|
+
from bioguider.utils.utils import convert_html_to_text, increase_token_usage
|
|
22
23
|
|
|
23
24
|
logger = logging.getLogger(__name__)
|
|
24
25
|
|
|
26
|
+
MAX_FILE_SIZE = 1024 * 100 # 100K
|
|
27
|
+
|
|
25
28
|
class TutorialEvaluationResult(BaseModel):
|
|
26
29
|
overall_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
|
|
27
30
|
overall_key_strengths: str=Field(description="A string value, the key strengths of the tutorial")
|
|
@@ -60,6 +63,40 @@ class EvaluationTutorialTask(EvaluationTask):
|
|
|
60
63
|
self.evaluation_name = "Tutorial Evaluation"
|
|
61
64
|
self.code_structure_db = code_structure_db
|
|
62
65
|
|
|
66
|
+
def _sanitize_files(self, files: list[str]) -> list[str]:
|
|
67
|
+
sanitized_files = []
|
|
68
|
+
for file in files:
|
|
69
|
+
file_path = Path(self.repo_path, file)
|
|
70
|
+
if not file_path.exists() or not file_path.is_file():
|
|
71
|
+
continue
|
|
72
|
+
if detect_file_type(file_path) == "binary":
|
|
73
|
+
continue
|
|
74
|
+
if file.endswith(".svg"):
|
|
75
|
+
continue
|
|
76
|
+
if not file.endswith(".ipynb") and file_path.stat().st_size > MAX_FILE_SIZE:
|
|
77
|
+
continue
|
|
78
|
+
sanitized_files.append(file)
|
|
79
|
+
return sanitized_files
|
|
80
|
+
|
|
81
|
+
def _sanitize_file_content(self, file: str) -> Tuple[str | None, str | None]:
    """
    Load a tutorial file and derive the two text variants used for evaluation.

    Args:
        file (str): Path relative to ``self.repo_path``.

    Returns:
        Tuple[str | None, str | None]: ``(content, readability_content)``.
            Both are ``None`` when ``read_file`` returns ``None``.
            For ``.ipynb`` files, ``content`` is the JSON dump of the stripped
            notebook and ``readability_content`` is its extracted markdown.
            For ``.html``/``.htm`` files, both start from the text extracted
            from the HTML. In these two cases ``{`` and ``}`` in ``content``
            are rewritten to ``<<`` and ``>>``.
            For every other file both values are the raw file content.
    """
    target = Path(self.repo_path, file)
    raw = read_file(target)
    if raw is None:
        logger.error(f"Error in reading file {file} - {Path(self.repo_path, file).resolve()}")
        return None, None

    if file.endswith(".ipynb"):
        readable = extract_markdown_from_notebook(target)
        prompt_text = json.dumps(strip_notebook_to_code_and_markdown(target))
    elif file.endswith((".html", ".htm")):
        readable = convert_html_to_text(target)
        prompt_text = readable
    else:
        # Plain text files are passed through untouched for both purposes.
        return raw, raw

    # NOTE(review): presumably braces are replaced so the text is not read as
    # template placeholders further down the pipeline - confirm with callers.
    return prompt_text.replace("{", "<<").replace("}", ">>"), readable
|
|
99
|
+
|
|
63
100
|
def _collect_files(self):
|
|
64
101
|
task = CollectionTask(
|
|
65
102
|
llm=self.llm,
|
|
@@ -72,6 +109,8 @@ class EvaluationTutorialTask(EvaluationTask):
|
|
|
72
109
|
goal_item=CollectionGoalItemEnum.Tutorial.name,
|
|
73
110
|
)
|
|
74
111
|
files = task.collect()
|
|
112
|
+
files = flatten_files(self.repo_path, files)
|
|
113
|
+
files = self._sanitize_files(files)
|
|
75
114
|
return files
|
|
76
115
|
|
|
77
116
|
def _evaluate_consistency(self, file: str) -> ConsistencyEvaluationResult:
|
|
@@ -100,17 +139,11 @@ class EvaluationTutorialTask(EvaluationTask):
|
|
|
100
139
|
), {**DEFAULT_TOKEN_USAGE}
|
|
101
140
|
|
|
102
141
|
def _evaluate_individual_tutorial(self, file: str) -> tuple[IndividualTutorialEvaluationResult | None, dict]:
|
|
103
|
-
content =
|
|
104
|
-
if content is None:
|
|
105
|
-
logger.error(f"Error in
|
|
142
|
+
content, readability_content = self._sanitize_file_content(file)
|
|
143
|
+
if content is None or readability_content is None:
|
|
144
|
+
logger.error(f"Error in sanitizing file {file} - {Path(self.repo_path, file).resolve()}")
|
|
106
145
|
return None, {**DEFAULT_TOKEN_USAGE}
|
|
107
|
-
|
|
108
|
-
if file.endswith(".ipynb"):
|
|
109
|
-
readability_content = extract_markdown_from_notebook(Path(self.repo_path, file))
|
|
110
|
-
content = json.dumps(strip_notebook_to_code_and_markdown(Path(self.repo_path, file)))
|
|
111
|
-
content = content.replace("{", "<<").replace("}", ">>")
|
|
112
|
-
else:
|
|
113
|
-
readability_content = content
|
|
146
|
+
|
|
114
147
|
readability = PyphenReadability()
|
|
115
148
|
flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
|
|
116
149
|
_, _, _, _, _ = readability.readability_metrics(readability_content)
|
|
@@ -12,11 +12,12 @@ from bioguider.agents.prompt_utils import CollectionGoalItemEnum
|
|
|
12
12
|
from bioguider.utils.constants import (
|
|
13
13
|
DEFAULT_TOKEN_USAGE,
|
|
14
14
|
)
|
|
15
|
+
from bioguider.utils.file_utils import detect_file_type, flatten_files
|
|
15
16
|
from ..utils.pyphen_utils import PyphenReadability
|
|
16
17
|
|
|
17
18
|
from .evaluation_task import EvaluationTask
|
|
18
19
|
from .agent_utils import read_file
|
|
19
|
-
from bioguider.utils.utils import increase_token_usage
|
|
20
|
+
from bioguider.utils.utils import convert_html_to_text, increase_token_usage
|
|
20
21
|
from .evaluation_userguide_prompts import INDIVIDUAL_USERGUIDE_EVALUATION_SYSTEM_PROMPT
|
|
21
22
|
|
|
22
23
|
|
|
@@ -37,6 +38,9 @@ class IndividualUserGuideEvaluationResult(BaseModel):
|
|
|
37
38
|
|
|
38
39
|
logger = logging.getLogger(__name__)
|
|
39
40
|
|
|
41
|
+
MAX_FILE_SIZE = 1024 * 100 # 100K
|
|
42
|
+
|
|
43
|
+
|
|
40
44
|
class EvaluationUserGuideTask(EvaluationTask):
|
|
41
45
|
def __init__(
|
|
42
46
|
self,
|
|
@@ -51,6 +55,19 @@ class EvaluationUserGuideTask(EvaluationTask):
|
|
|
51
55
|
super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
|
|
52
56
|
self.evaluation_name = "User Guide Evaluation"
|
|
53
57
|
self.code_structure_db = code_structure_db
|
|
58
|
+
|
|
59
|
+
def sanitize_files(self, files: list[str]) -> list[str]:
    """
    Filter the collected user-guide candidates down to files that can be evaluated.

    Args:
        files (list[str]): Paths relative to ``self.repo_path``.

    Returns:
        list[str]: The entries of ``files``, in their original order, that
            are regular files, are not reported as binary by
            ``detect_file_type``, and are no larger than ``MAX_FILE_SIZE``
            bytes.
    """
    def _is_usable(name: str) -> bool:
        candidate = Path(self.repo_path, name)
        # is_file() is already False for paths that do not exist.
        if not candidate.is_file():
            return False
        if detect_file_type(candidate) == "binary":
            return False
        return candidate.stat().st_size <= MAX_FILE_SIZE

    return [name for name in files if _is_usable(name)]
|
|
54
71
|
|
|
55
72
|
def _collect_files(self):
|
|
56
73
|
task = CollectionTask(
|
|
@@ -64,32 +81,43 @@ class EvaluationUserGuideTask(EvaluationTask):
|
|
|
64
81
|
goal_item=CollectionGoalItemEnum.UserGuide.name,
|
|
65
82
|
)
|
|
66
83
|
files = task.collect()
|
|
84
|
+
files = flatten_files(self.repo_path, files)
|
|
85
|
+
files = self.sanitize_files(files)
|
|
67
86
|
return files
|
|
68
87
|
|
|
69
|
-
def
|
|
88
|
+
def _evaluate_consistency_on_content(self, content: str) -> ConsistencyEvaluationResult:
|
|
70
89
|
consistency_evaluation_task = ConsistencyEvaluationTask(
|
|
71
90
|
llm=self.llm,
|
|
72
91
|
code_structure_db=self.code_structure_db,
|
|
73
92
|
step_callback=self.step_callback,
|
|
74
93
|
)
|
|
75
|
-
file = file.strip()
|
|
76
|
-
with open(Path(self.repo_path, file), "r") as f:
|
|
77
|
-
user_guide_api_documentation = f.read()
|
|
78
94
|
return consistency_evaluation_task.evaluate(
|
|
79
95
|
domain="user guide/API",
|
|
80
|
-
documentation=
|
|
96
|
+
documentation=content,
|
|
81
97
|
), {**DEFAULT_TOKEN_USAGE}
|
|
82
98
|
|
|
99
|
+
def _evaluate_consistency(self, file: str) -> ConsistencyEvaluationResult:
|
|
100
|
+
file = file.strip()
|
|
101
|
+
with open(Path(self.repo_path, file), "r") as f:
|
|
102
|
+
user_guide_api_documentation = f.read()
|
|
103
|
+
return self._evaluate_consistency_on_content(user_guide_api_documentation)
|
|
104
|
+
|
|
83
105
|
def _evaluate_individual_userguide(self, file: str) -> tuple[IndividualUserGuideEvaluationResult | None, dict]:
|
|
84
|
-
content = read_file(Path(self.repo_path, file))
|
|
85
|
-
|
|
106
|
+
content = read_file(Path(self.repo_path, file))
|
|
86
107
|
if content is None:
|
|
87
108
|
logger.error(f"Error in reading file {file}")
|
|
88
109
|
return None, {**DEFAULT_TOKEN_USAGE}
|
|
89
110
|
|
|
111
|
+
if file.endswith(".html") or file.endswith(".htm"):
|
|
112
|
+
readability_content = convert_html_to_text(Path(self.repo_path, file))
|
|
113
|
+
content = readability_content
|
|
114
|
+
content = content.replace("{", "<<").replace("}", ">>")
|
|
115
|
+
else:
|
|
116
|
+
readability_content = content
|
|
117
|
+
|
|
90
118
|
readability = PyphenReadability()
|
|
91
119
|
flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
|
|
92
|
-
_, _, _, _, _ = readability.readability_metrics(
|
|
120
|
+
_, _, _, _, _ = readability.readability_metrics(readability_content)
|
|
93
121
|
system_prompt = ChatPromptTemplate.from_template(
|
|
94
122
|
INDIVIDUAL_USERGUIDE_EVALUATION_SYSTEM_PROMPT
|
|
95
123
|
).format(
|
|
@@ -97,7 +125,7 @@ class EvaluationUserGuideTask(EvaluationTask):
|
|
|
97
125
|
flesch_kincaid_grade=flesch_kincaid_grade,
|
|
98
126
|
gunning_fog_index=gunning_fog_index,
|
|
99
127
|
smog_index=smog_index,
|
|
100
|
-
userguide_content=
|
|
128
|
+
userguide_content=readability_content,
|
|
101
129
|
)
|
|
102
130
|
agent = CommonAgentTwoSteps(llm=self.llm)
|
|
103
131
|
res, _, token_usage, reasoning_process = agent.go(
|
|
@@ -107,7 +135,7 @@ class EvaluationUserGuideTask(EvaluationTask):
|
|
|
107
135
|
)
|
|
108
136
|
res: UserGuideEvaluationResult = res
|
|
109
137
|
|
|
110
|
-
consistency_evaluation_result, _temp_token_usage = self.
|
|
138
|
+
consistency_evaluation_result, _temp_token_usage = self._evaluate_consistency_on_content(content)
|
|
111
139
|
if consistency_evaluation_result is None:
|
|
112
140
|
# No sufficient information to evaluate the consistency of the user guide/API documentation
|
|
113
141
|
consistency_evaluation_result = ConsistencyEvaluationResult(
|
|
@@ -124,6 +152,7 @@ class EvaluationUserGuideTask(EvaluationTask):
|
|
|
124
152
|
def _evaluate(self, files: list[str] | None = None) -> tuple[dict[str, IndividualUserGuideEvaluationResult] | None, dict, list[str]]:
|
|
125
153
|
total_token_usage = {**DEFAULT_TOKEN_USAGE}
|
|
126
154
|
user_guide_evaluation_results = {}
|
|
155
|
+
files = flatten_files(self.repo_path, files)
|
|
127
156
|
for file in files:
|
|
128
157
|
if file.endswith(".py") or file.endswith(".R"):
|
|
129
158
|
continue
|
bioguider/utils/file_utils.py
CHANGED
|
@@ -3,6 +3,16 @@ from enum import Enum
|
|
|
3
3
|
import json
|
|
4
4
|
# from adalflow.utils import get_adalflow_default_root_path
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import Union, List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import string
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import magic # optional: pip install python-magic
|
|
13
|
+
HAS_MAGIC = True
|
|
14
|
+
except ImportError:
|
|
15
|
+
HAS_MAGIC = False
|
|
6
16
|
|
|
7
17
|
class FileType(Enum):
|
|
8
18
|
unknown = "u"
|
|
@@ -85,7 +95,7 @@ def extract_code_from_notebook(notebook_path: str) -> str:
|
|
|
85
95
|
# Combine all code cells into a single string
|
|
86
96
|
return '\n\n'.join(code_cells)
|
|
87
97
|
|
|
88
|
-
def parse_repo_url(url: str) ->
|
|
98
|
+
def parse_repo_url(url: str) -> Tuple[Optional[str], Optional[str]]:
|
|
89
99
|
"""
|
|
90
100
|
Parses a git repository URL to extract the author/organization and repository name.
|
|
91
101
|
|
|
@@ -122,7 +132,77 @@ def retrieve_data_root_path() -> Path:
|
|
|
122
132
|
root_folder = Path(data_folder, ".adalflow")
|
|
123
133
|
return root_folder.absolute()
|
|
124
134
|
|
|
135
|
+
def detect_file_type(filepath, blocksize=2048, use_magic=True):
    """
    Detect if a file is text or binary.

    Args:
        filepath (str | os.PathLike): Path to file.
        blocksize (int): Number of bytes to read for inspection.
        use_magic (bool): Use python-magic if available.

    Returns:
        str: "text" or "binary"

    Raises:
        OSError: If the heuristic path is taken and the file cannot be read.
    """
    import codecs  # local import: only needed by the heuristic below

    # MIME types outside "text/*" whose payload is nevertheless plain text.
    # Without these, JSON documents (including Jupyter notebooks), XML,
    # scripts and YAML would be reported as binary.
    textual_application_types = frozenset({
        "application/json",
        "application/x-ndjson",
        "application/xml",
        "application/javascript",
        "application/x-javascript",
        "application/x-sh",
        "application/x-shellscript",
        "application/yaml",
        "application/x-yaml",
        "application/toml",
    })

    # Option 1: Use python-magic if available and requested
    if use_magic and HAS_MAGIC:
        try:
            # os.fspath() lets pathlib.Path callers work with any
            # python-magic version.
            mime = magic.from_file(os.fspath(filepath), mime=True)
        except Exception:
            # Deliberate best effort: any libmagic failure falls back to the
            # byte heuristic instead of aborting the caller.
            pass
        else:
            if mime and (
                mime.startswith("text/")
                or mime in textual_application_types
                # Structured-syntax suffixes, e.g. application/xhtml+xml.
                # Restricted to application/* so image/svg+xml stays binary.
                or (mime.startswith("application/") and mime.endswith(("+json", "+xml")))
            ):
                return "text"
            return "binary"

    # Option 2: Heuristic detection
    with open(filepath, "rb") as f:
        chunk = f.read(blocksize)
    if not chunk:  # empty file -> treat as text
        return "text"

    # Null byte check
    if b"\0" in chunk:
        return "binary"

    # Check ratio of non-printable characters
    text_chars = bytearray(string.printable, "ascii")
    try:
        # final=False tolerates a multi-byte sequence cut off by blocksize.
        codecs.getincrementaldecoder("utf-8")().decode(chunk, final=False)
    except UnicodeDecodeError:
        pass  # not UTF-8: high bytes keep counting as non-text
    else:
        # Valid UTF-8: bytes >= 0x80 belong to encoded characters, so prose
        # in non-Latin scripts is not mistaken for binary data.
        text_chars.extend(range(0x80, 0x100))
    nontext = chunk.translate(None, text_chars)
    if float(len(nontext)) / len(chunk) > 0.30:
        return "binary"

    return "text"
|
|
174
|
+
|
|
175
|
+
def flatten_files(repo_path: Union[str, Path], files: Optional[List[str]]) -> List[str]:
    """
    Flatten directories into individual files.

    Args:
        repo_path (Union[str, Path]): The root path of the repository
        files (Optional[List[str]]): List of file/directory paths to flatten

    Returns:
        List[str]: List of individual file paths (directories are expanded to
            their contents, in sorted order). Each file appears once, at its
            first occurrence. Entries that are empty, are not path-like, or
            do not exist are skipped. ``None`` input yields an empty list.
    """
    if files is None:
        return []

    flattened: List[str] = []
    seen = set()
    repo_path = Path(repo_path)

    def _add(path_str: str) -> None:
        # Deduplicate on the normalised form so "docs/a.md" and "./docs/a.md"
        # count as the same file, while keeping the spelling first seen.
        key = os.path.normpath(path_str)
        if key not in seen:
            seen.add(key)
            flattened.append(path_str)

    for file_path in files:
        # The list may come from parsed LLM output; ignore entries that are
        # not usable as paths instead of raising TypeError.
        if not isinstance(file_path, (str, os.PathLike)):
            continue
        # An empty entry would resolve to repo_path itself and silently
        # expand the whole repository.
        if isinstance(file_path, str) and not file_path.strip():
            continue

        full_path = repo_path / file_path

        if full_path.is_dir():
            # If it's a directory, recursively get all files in it.
            # Sorted so the result does not depend on filesystem order.
            for item in sorted(full_path.rglob("*")):
                if item.is_file():
                    # Get relative path from repo_path
                    _add(str(item.relative_to(repo_path)))
        elif full_path.is_file():
            # If it's already a file, just add it
            _add(file_path)
        # Skip if path doesn't exist

    return flattened
|
|
128
208
|
|
bioguider/utils/utils.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from pathlib import Path
|
|
2
3
|
import re
|
|
3
4
|
import subprocess
|
|
4
5
|
from typing import Optional
|
|
5
6
|
from pydantic import BaseModel
|
|
6
7
|
import tiktoken
|
|
8
|
+
from bs4 import BeautifulSoup
|
|
7
9
|
|
|
8
10
|
from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
|
|
9
11
|
logger = logging.getLogger(__name__)
|
|
@@ -101,4 +103,22 @@ def convert_to_serializable(obj):
|
|
|
101
103
|
elif isinstance(obj, tuple):
|
|
102
104
|
return [convert_to_serializable(item) for item in obj]
|
|
103
105
|
else:
|
|
104
|
-
return obj
|
|
106
|
+
return obj
|
|
107
|
+
|
|
108
|
+
def convert_html_to_text(html_path: str | Path, exclude_tags: list[str] | None = None) -> str:
|
|
109
|
+
"""
|
|
110
|
+
This function is used to convert html string to text, that is,
|
|
111
|
+
extract text from html content, including tables.
|
|
112
|
+
"""
|
|
113
|
+
html_path = Path(html_path)
|
|
114
|
+
if not html_path.exists():
|
|
115
|
+
raise FileNotFoundError(f"File {html_path} does not exist")
|
|
116
|
+
with html_path.open("r", encoding="utf-8") as f:
|
|
117
|
+
html_content = f.read()
|
|
118
|
+
soup = BeautifulSoup(html_content, "html.parser")
|
|
119
|
+
if exclude_tags is not None:
|
|
120
|
+
for tag in exclude_tags:
|
|
121
|
+
for element in soup.find_all(tag):
|
|
122
|
+
element.decompose()
|
|
123
|
+
text = soup.get_text(separator="\n", strip=True)
|
|
124
|
+
return text
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: bioguider
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.25
|
|
4
4
|
Summary: An AI-Powered package to help biomedical developers to generate clear documentation
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Cankun Wang
|
|
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.13
|
|
19
19
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
20
|
Requires-Dist: adalflow (>=1.0.4,<2.0.0)
|
|
21
|
+
Requires-Dist: beautifulsoup4 (>=4.13.3,<5.0.0)
|
|
21
22
|
Requires-Dist: binaryornot (>=0.4.4,<0.5.0)
|
|
22
23
|
Requires-Dist: faiss-cpu (>=1.11.0,<2.0.0)
|
|
23
24
|
Requires-Dist: grandalf (>=0.8,<0.9)
|
|
@@ -36,6 +37,7 @@ Requires-Dist: pyphen (>=0.17.2,<0.18.0)
|
|
|
36
37
|
Requires-Dist: pytest (>=8.3.5,<9.0.0)
|
|
37
38
|
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
|
|
38
39
|
Requires-Dist: python-iso639 (>=2025.2.18,<2026.0.0)
|
|
40
|
+
Requires-Dist: python-magic (>=0.4.27,<0.5.0)
|
|
39
41
|
Requires-Dist: tenacity (>=9.1.2,<10.0.0)
|
|
40
42
|
Requires-Dist: textstat (>=0.7.6,<0.8.0)
|
|
41
43
|
Requires-Dist: tiktoken (>=0.9.0,<0.10.0)
|
|
@@ -6,7 +6,7 @@ bioguider/agents/agent_utils.py,sha256=Mj6yr_2y4veWokXrXlAsaP38Ez9sdnZruM8Znnpjx
|
|
|
6
6
|
bioguider/agents/collection_execute_step.py,sha256=jE_oSQZI5WDaz0bJjUWoAfqWfVbGUqN--cvITSWCGiI,5614
|
|
7
7
|
bioguider/agents/collection_observe_step.py,sha256=1xOw6N3uIoyh4h4_vcULAc5x5KZ9G-zZo42AhRidyn8,5373
|
|
8
8
|
bioguider/agents/collection_plan_step.py,sha256=Nn0f8AOkEDCDtnhaqE7yCQoi7PVpsHmiUcsIqC0T0dQ,5956
|
|
9
|
-
bioguider/agents/collection_task.py,sha256=
|
|
9
|
+
bioguider/agents/collection_task.py,sha256=rW4lfewetPBKX2dmnfc_yM4bCu2QOl-rpGnrmOYxr3o,8019
|
|
10
10
|
bioguider/agents/collection_task_utils.py,sha256=mCmjHFD4HY1mSwkfqPaJbZ8sm6ijjdhnNKj40xudE98,5424
|
|
11
11
|
bioguider/agents/common_agent.py,sha256=TpfxbYskwuwWrjs1g9RaG7sdA5rOLdiVac7If7uK2sg,4558
|
|
12
12
|
bioguider/agents/common_agent_2step.py,sha256=rGiDzUkmmUIFnmJJxzXK5M5BfIyINHXLZ0pmPRUVqQg,7911
|
|
@@ -16,7 +16,7 @@ bioguider/agents/consistency_collection_step.py,sha256=evgb0W3PD5pXfViuP_0T5LqLn
|
|
|
16
16
|
bioguider/agents/consistency_evaluation_task.py,sha256=_nYPEs3xhj1jraQWMRML_Y6vZJD_zydY4BstQYXmXJk,1908
|
|
17
17
|
bioguider/agents/consistency_evaluation_task_utils.py,sha256=8PC4KS14ek9NJy3bLuhZNmpOUKFx3_06nKXzuTWb0tE,444
|
|
18
18
|
bioguider/agents/consistency_observe_step.py,sha256=0FEtNpkaGcuL30wb2d43uTlSyHJ9Pxttn9r3NzPD0oo,4409
|
|
19
|
-
bioguider/agents/consistency_query_step.py,sha256=
|
|
19
|
+
bioguider/agents/consistency_query_step.py,sha256=SRkw9UiLAhMnbPgexffHsAznrdLAQvIP4KD7S7N-P14,3858
|
|
20
20
|
bioguider/agents/dockergeneration_execute_step.py,sha256=F92jDlkc6KjAvTkX7q1FsCYP8J15SCaNgmwh3YPqfDo,6500
|
|
21
21
|
bioguider/agents/dockergeneration_observe_step.py,sha256=Bo5Td0fzMYLbLki0FvwamzqRFOy4eu3AvIUa8oFApE4,6131
|
|
22
22
|
bioguider/agents/dockergeneration_plan_step.py,sha256=SB8tQM9PkIKsD2o1DFD7bedcxz6r6hSy8n_EVK60Fz0,7235
|
|
@@ -26,10 +26,10 @@ bioguider/agents/evaluation_installation_task.py,sha256=0RNH5NV7YKwn_we_d3IjnFf_
|
|
|
26
26
|
bioguider/agents/evaluation_readme_task.py,sha256=pi3oAGJgZhJgJG1xLgiobrk3Uy2a_JIarD5QSPBkmHA,30647
|
|
27
27
|
bioguider/agents/evaluation_submission_requirements_task.py,sha256=J_6C-M2AfYue2C-gWBHl7KqGrTBuFBn9zmMV5vSRk-U,7834
|
|
28
28
|
bioguider/agents/evaluation_task.py,sha256=uu0BjalctF9hQjGtT53whbeJHv2RVvs8_2woVUmOLRE,8132
|
|
29
|
-
bioguider/agents/evaluation_tutorial_task.py,sha256=
|
|
29
|
+
bioguider/agents/evaluation_tutorial_task.py,sha256=OSYdHBdkKPi61mDx8iL2_YT2KSF3Ea0VPCkJdPEsxag,9919
|
|
30
30
|
bioguider/agents/evaluation_tutorial_task_prompts.py,sha256=WTqIKBI3JErYaiQfLcPNj_hb3D-hSZrae50uJ7526zw,5996
|
|
31
31
|
bioguider/agents/evaluation_userguide_prompts.py,sha256=eyJUx5nUr8v9k0B5GpKDaX2dBxSLVZGA0fwOWS4Uiow,7154
|
|
32
|
-
bioguider/agents/evaluation_userguide_task.py,sha256=
|
|
32
|
+
bioguider/agents/evaluation_userguide_task.py,sha256=tY7veT1LfafJBV7FrJeJYSUu8IwY2Blnu57Z5gEjjrM,7736
|
|
33
33
|
bioguider/agents/identification_execute_step.py,sha256=w3IjL8f2WiHCyiLjVSoySnIAXpi1-hK1DLKCnXbAN2Y,5587
|
|
34
34
|
bioguider/agents/identification_observe_step.py,sha256=Me5mhEM4e7FGnVFcluNtqfhIxzng6guGIu39xi1TrS8,4341
|
|
35
35
|
bioguider/agents/identification_plan_step.py,sha256=owsTK1NZIuiZL7QPVknJyp9TBRK-mhnuf2RwK4YzaxU,5442
|
|
@@ -67,14 +67,14 @@ bioguider/settings.py,sha256=BD_iz9aYarxmWUl0XaKl4-D4oTXMhFzljsXLNn2phis,3143
|
|
|
67
67
|
bioguider/utils/code_structure_builder.py,sha256=PVeooVKxkiBEkX_VEu8h1ACGs-ynrgk_ce2FKJfeeZc,1947
|
|
68
68
|
bioguider/utils/constants.py,sha256=NGmqEgxNDL1fe-htJbtHGcU94EVUK28YAupxGYOJO_c,9012
|
|
69
69
|
bioguider/utils/default.gitignore,sha256=XjPdyO2KV8z8iyuqluaNR_70tBQftMpyKL8HboVNyeI,1605
|
|
70
|
-
bioguider/utils/file_utils.py,sha256=
|
|
70
|
+
bioguider/utils/file_utils.py,sha256=DOWRluneNpGQ4uVwwX9Tp2VzmZ56wIqXKMyjcMH9Bpc,6229
|
|
71
71
|
bioguider/utils/gitignore_checker.py,sha256=pOYUwsS9D5014LxcZb0cj3s2CAYaD2uF_pYJpaNKcho,6532
|
|
72
72
|
bioguider/utils/notebook_utils.py,sha256=SfU1iLuwgbDzNN-TUh_qbnfUSgn-PI6NrK6QfmdpMqQ,4009
|
|
73
73
|
bioguider/utils/pyphen_utils.py,sha256=cdZc3qphkvMDeL5NiZ8Xou13M_uVNP7ifJ-FwxO-0BE,2680
|
|
74
74
|
bioguider/utils/python_file_handler.py,sha256=BERiE2RHxpu3gAzv26jr8ZQetkrtnMZOv9SjpQ7WIdg,2650
|
|
75
75
|
bioguider/utils/r_file_handler.py,sha256=8HpFaYKP8N1nItwr9tOx49m99pcLSt8EUtTNTJ7xNoE,19564
|
|
76
|
-
bioguider/utils/utils.py,sha256=
|
|
77
|
-
bioguider-0.2.
|
|
78
|
-
bioguider-0.2.
|
|
79
|
-
bioguider-0.2.
|
|
80
|
-
bioguider-0.2.
|
|
76
|
+
bioguider/utils/utils.py,sha256=_JOaAtLiQRq5Z1jb6Hb5KIbkYxCvX07qeRN21gKkt_o,4307
|
|
77
|
+
bioguider-0.2.25.dist-info/LICENSE,sha256=qzkvZcKwwA5DuSuhXMOm2LcO6BdEr4V7jwFZVL2-jL4,1065
|
|
78
|
+
bioguider-0.2.25.dist-info/METADATA,sha256=s54DzbxBT5IaNrWZyTwv63-8LV4iciTaK_wC8C34xAs,1962
|
|
79
|
+
bioguider-0.2.25.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
80
|
+
bioguider-0.2.25.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|