user-simulator 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- user_sim/__init__.py +0 -0
- user_sim/cli/__init__.py +0 -0
- user_sim/cli/gen_user_profile.py +34 -0
- user_sim/cli/init_project.py +65 -0
- user_sim/cli/sensei_chat.py +481 -0
- user_sim/cli/sensei_check.py +103 -0
- user_sim/cli/validation_check.py +143 -0
- user_sim/core/__init__.py +0 -0
- user_sim/core/ask_about.py +665 -0
- user_sim/core/data_extraction.py +260 -0
- user_sim/core/data_gathering.py +134 -0
- user_sim/core/interaction_styles.py +147 -0
- user_sim/core/role_structure.py +608 -0
- user_sim/core/user_simulator.py +302 -0
- user_sim/handlers/__init__.py +0 -0
- user_sim/handlers/asr_module.py +128 -0
- user_sim/handlers/html_parser_module.py +202 -0
- user_sim/handlers/image_recognition_module.py +139 -0
- user_sim/handlers/pdf_parser_module.py +123 -0
- user_sim/utils/__init__.py +0 -0
- user_sim/utils/config.py +47 -0
- user_sim/utils/cost_tracker.py +153 -0
- user_sim/utils/cost_tracker_v2.py +193 -0
- user_sim/utils/errors.py +15 -0
- user_sim/utils/exceptions.py +47 -0
- user_sim/utils/languages.py +78 -0
- user_sim/utils/register_management.py +62 -0
- user_sim/utils/show_logs.py +63 -0
- user_sim/utils/token_cost_calculator.py +338 -0
- user_sim/utils/url_management.py +60 -0
- user_sim/utils/utilities.py +568 -0
- user_simulator-0.1.0.dist-info/METADATA +733 -0
- user_simulator-0.1.0.dist-info/RECORD +37 -0
- user_simulator-0.1.0.dist-info/WHEEL +5 -0
- user_simulator-0.1.0.dist-info/entry_points.txt +6 -0
- user_simulator-0.1.0.dist-info/licenses/LICENSE.txt +21 -0
- user_simulator-0.1.0.dist-info/top_level.txt +1 -0
user_sim/core/user_simulator.py
@@ -0,0 +1,302 @@
from .data_extraction import DataExtraction
from user_sim.utils.utilities import *
from .data_gathering import *
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.chat_models import init_chat_model
from user_sim.utils.token_cost_calculator import calculate_cost, max_input_tokens_allowed, max_output_tokens_allowed, invoke_llm
from user_sim.utils import config

import logging

parser = StrOutputParser()
logger = logging.getLogger('Info Logger')


class UserChain:

    def __init__(self, user_role, temp):
        self.user_role = user_role
        self.user_llm = None
        self.model = None
        self.model_provider = None
        self.temperature = temp
        self.init_user_module()
        self.user_context = PromptTemplate(
            input_variables=["reminder", "history"],
            template=self.set_role_template()
        )
        self.chain = None

    def init_user_module(self):
        self.model = config.model
        self.model_provider = config.model_provider

        if self.model_provider is None:
            params = {
                "model": self.model,
                "temperature": self.temperature
            }
        else:
            params = {
                "model": self.model,
                "model_provider": self.model_provider,
                "temperature": self.temperature
            }

        self.user_llm = init_chat_model(**params)

    def set_role_template(self):
        reminder = """{reminder}"""
        history = """History of the conversation so far: {history}"""
        role_prompt = self.user_role + reminder + history
        return role_prompt

    @staticmethod
    def parse_history(conversation_history):
        lines = []
        for inp in conversation_history['interaction']:
            for k, v in inp.items():
                lines.append(f"{k}: {v}")
        return "\n".join(lines)

    def text_method(self, conversation_history, reminder):
        history = self.parse_history(conversation_history)  # formats list to str
        # input_params = {'history': history, 'reminder': reminder}
        # invoke_llm(self.user_llm, self.user_context, input_params, self.model, module="user_simulator", parser=True)

        if max_input_tokens_allowed(history+reminder, model_used=self.model):
            logger.error(f"Token limit was surpassed")
            return "exit"

        if config.token_count_enabled:
            self.user_llm.max_tokens = max_output_tokens_allowed(self.model)

        self.chain = self.user_context | self.user_llm | parser

        response = self.chain.invoke({'history': history, 'reminder': reminder})
        if config.token_count_enabled:
            calculate_cost(history + reminder, response, self.model, module="user_simulator")
        return response

    def invoke(self, conversation_history, reminder):

        response = self.text_method(conversation_history, reminder)

        return response


class UserSimulator:

    def __init__(self, user_profile, chatbot):

        self.user_profile = user_profile
        self.chatbot = chatbot
        self.temp = user_profile.temperature
        self.conversation_history = {'interaction': []}
        self.ask_about = user_profile.ask_about.prompt()
        self.data_gathering = ChatbotAssistant(user_profile.ask_about.phrases)
        self.goal_style = user_profile.goal_style
        self.test_name = user_profile.test_name
        self.repeat_count = 0
        self.loop_count = 0
        self.interaction_count = 0
        self.user_chain = UserChain(self.user_profile.role, self.temp)
        self.my_context = self.InitialContext()
        self.output_slots = self.__build_slot_dict()
        self.error_report = []

    def __build_slot_dict(self):
        slot_dict = {}
        output_list = self.user_profile.output
        for output in output_list:
            var_name = list(output.keys())[0]
            slot_dict[var_name] = None
        return slot_dict

    class InitialContext:
        def __init__(self):
            self.original_context = []
            self.context_list = []

        def initiate_context(self, context):

            default_context = config.default_context

            if isinstance(context, list):
                self.original_context = context.copy() + default_context.copy()
                self.context_list = context.copy() + default_context.copy()
            else:
                self.original_context = [context] + default_context
                self.context_list = [context] + default_context

        def add_context(self, new_context):
            if isinstance(new_context, list):
                for cont in new_context:
                    self.context_list.append(cont)
            else:
                self.context_list.append(new_context)
            # TODO: add exception to force the user to initiate the context

        def get_context(self):
            return '. '.join(self.context_list)

        def reset_context(self):
            self.context_list = self.original_context.copy()

    def repetition_track(self, response, reps=3):

        self.my_context.reset_context()
        logger.info(f'Context list: {self.my_context.context_list}')

        if nlp_processor(response, self.chatbot.fallback, 0.6):

            self.repeat_count += 1
            self.loop_count += 1
            logger.info(f"is fallback. Repeat_count: {self.repeat_count}. Loop count: {self.loop_count}")

            if self.repeat_count >= reps:
                self.repeat_count = 0
                change_topic = """
                Since the assistant is not understanding what you're saying, change the
                topic to other things to ask about without starting a new conversation
                """

                self.my_context.add_context(change_topic)

            else:
                ask_repetition = """
                If the assistant asks you to repeat the question, repeat the last question the user
                said but rephrase it.
                """

                self.my_context.add_context(ask_repetition)
        else:
            self.repeat_count = 0
            self.loop_count = 0

    @staticmethod
    def conversation_ending(response):
        return nlp_processor(response, "src/testing/user_sim/end_conversation_patterns.yml", 0.5)

    def get_history(self):

        lines = []
        for inp in self.conversation_history['interaction']:
            for k, v in inp.items():
                lines.append(f"{k}: {v}")
        return "\n".join(lines)

    def update_history(self, role, message):
        self.conversation_history['interaction'].append({role: message})

    def end_conversation(self, input_msg):

        if config.total_cost >= config.limit_cost or config.total_individual_cost >= config.limit_individual_cost:
            if config.total_cost >= config.limit_cost:
                config.errors.append({2000: 'Exceeded global cost'})
            elif config.total_individual_cost >= config.limit_individual_cost:
                config.errors.append({2001: 'Exceeded conversation specific cost'})

            logger.info('is end')
            return True

        if self.goal_style[0] == 'steps' or self.goal_style[0] == 'random steps':
            if self.interaction_count >= self.goal_style[1]:
                logger.info('is end')
                return True

        elif self.conversation_ending(input_msg) or self.loop_count >= 9:
            config.errors.append({1000: 'Exceeded loop Limit'})
            logger.warning('Loop count surpassed 9 interactions. Ending conversation.')
            return True

        elif 'all_answered' in self.goal_style[0] or 'default' in self.goal_style[0]:
            if (self.data_gathering.gathering_register["verification"].all()
                    and self.all_data_collected()
                    or self.goal_style[2] <= self.interaction_count):
                logger.info(f'limit amount of interactions achieved: {self.goal_style[2]}. Ending conversation.')
                return True
            else:
                return False

        else:
            return False

    def all_data_collected(self):
        output_list = self.user_profile.output
        for output in output_list:
            var_name = list(output.keys())[0]
            var_dict = output.get(var_name)
            if var_name in self.output_slots and self.output_slots[var_name] is not None:
                continue
            my_data_extract = DataExtraction(self.conversation_history,
                                             var_name,
                                             var_dict["type"],
                                             var_dict["description"])
            value = my_data_extract.get_data_extraction()
            if value[var_name] is None:
                return False
            else:
                self.output_slots[var_name] = value[var_name]
        return True

    def get_response(self, input_msg):

        self.update_history("Assistant", input_msg)
        self.data_gathering.add_message(self.conversation_history)

        if self.end_conversation(input_msg):
            return "exit"

        self.repetition_track(input_msg)

        self.my_context.add_context(self.user_profile.get_language())

        # history = self.get_history()

        user_response = self.user_chain.invoke(self.conversation_history, self.my_context.get_context())

        self.update_history("User", user_response)

        self.interaction_count += 1

        return user_response

    @staticmethod
    def formatting(role, msg):
        return [{"role": role, "content": msg}]

    def get_interaction_styles_prompt(self):
        interaction_style_prompt = []
        for instance in self.user_profile.interaction_styles:
            if instance.change_language_flag:
                pass
            else:
                interaction_style_prompt.append(instance.get_prompt())
        return ''.join(interaction_style_prompt)

    def open_conversation(self, input_msg=None):

        interaction_style_prompt = self.get_interaction_styles_prompt()
        self.my_context.initiate_context([self.user_profile.context,
                                          interaction_style_prompt,
                                          self.ask_about])

        language_context = self.user_profile.get_language()
        self.my_context.add_context(language_context)

        if input_msg:
            self.update_history("Assistant", input_msg)
            self.data_gathering.add_message(self.conversation_history)
            if self.end_conversation(input_msg):
                return "exit"
            self.repetition_track(input_msg)

        user_response = self.user_chain.invoke(self.conversation_history, self.my_context.get_context())

        self.update_history("User", user_response)

        self.data_gathering.add_message(self.conversation_history)
        self.interaction_count += 1
        return user_response
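For orientation, a driving loop around UserSimulator might look like the sketch below. This is only a sketch: `profile` and `chatbot` are assumed objects (the profile must expose the attributes read in __init__ above, such as role, temperature, ask_about, goal_style, interaction_styles, output, context and get_language(); the connector needs a .fallback attribute and some way to send messages), and chatbot.send is a hypothetical method name, not part of this package.

from user_sim.core.user_simulator import UserSimulator

# `profile` and `chatbot` are assumed to be built elsewhere (e.g. via role_structure / a chatbot connector).
simulator = UserSimulator(profile, chatbot)

user_msg = simulator.open_conversation()        # the simulated user opens the conversation
while user_msg != "exit":
    bot_msg = chatbot.send(user_msg)            # hypothetical connector call
    user_msg = simulator.get_response(bot_msg)  # returns "exit" once a goal, loop or cost limit is hit

print(simulator.output_slots)                   # any output variables extracted during the chat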
File without changes
user_sim/handlers/asr_module.py
@@ -0,0 +1,128 @@
import speech_recognition as sr
from pydantic import BaseModel, ValidationError
from typing import List, Union, Dict, Optional
import time
from user_sim.utils.utilities import read_yaml
from user_sim.utils.token_cost_calculator import calculate_cost, max_input_tokens_allowed
from openai import OpenAI
import warnings
import pygame
import logging

logger = logging.getLogger('Info Logger')


pygame.mixer.init()
warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
warnings.filterwarnings("ignore", category=RuntimeWarning, module="pydub")
client = OpenAI()
audio_format = "mp3"


def get_audio_duration(audio):
    from pydub import AudioSegment
    import io
    wav_data = audio.get_wav_data()
    audio_segment = AudioSegment.from_file(io.BytesIO(wav_data), format="wav")
    duration_seconds = len(audio_segment) / 1000.0
    return duration_seconds


class SttModel(BaseModel):
    energy_threshold: float = 50
    pause_threshold: float = 1


class TtsModel(BaseModel):
    model: str = "tts-1"
    voice: str = "alloy"
    speed: float = 1.0


class SpeechModel(BaseModel):
    stt: Optional[SttModel] = SttModel()  # default instance, not the class itself
    tts: Optional[TtsModel] = TtsModel()


class STTModule:

    def __init__(self, config):

        if config:
            config_file = read_yaml(config)
            try:
                validated_data = SpeechModel(**config_file)
            except ValidationError as e:
                print(e.json())
                raise

            # STT
            self.energy_th = validated_data.stt.energy_threshold
            self.pause_th = validated_data.stt.pause_threshold

            # TTS
            self.model = validated_data.tts.model
            self.voice = validated_data.tts.voice
            self.speed = validated_data.tts.speed

        else:
            # STT
            self.energy_th = 50
            self.pause_th = 1

            # TTS
            self.model = "tts-1"
            self.voice = "alloy"
            self.speed = 1.0

    def hear(self):
        r = sr.Recognizer()
        with sr.Microphone() as source:
            logger.info("Listening...")
            r.energy_threshold = self.energy_th  # lower values for quieter rooms
            r.pause_threshold = self.pause_th
            audio = r.listen(source)

        try:
            logger.info("Recognizing...")
            start = time.time()
            audio_length = get_audio_duration(audio)
            if max_input_tokens_allowed(model_used="whisper", audio_length=audio_length):
                logger.error("Token limit was surpassed")
                return False, None

            text = r.recognize_whisper(audio)
            end = time.time()

            calculate_cost(model="whisper", module="stt_module", audio_length=audio_length)
            logger.info(f"Recognition time: {end - start}")
            return True, text
        except sr.UnknownValueError:
            logger.warning("Recognition model could not understand audio")
            return True, "Repeat, please."
        except sr.RequestError as e:
            logger.warning("Could not request results from Speech Recognition service; {0}".format(e))
            return False, None

    def say(self, message):

        if max_input_tokens_allowed(message, self.model):
            return

        with client.audio.speech.with_streaming_response.create(
                model=self.model,
                voice=self.voice,
                speed=self.speed,
                input=message,
                response_format=audio_format
        ) as response:
            response.stream_to_file("data/audio_files/output." + audio_format)

        calculate_cost(message, model=self.model, module="tts_module")
        logger.info("Playing...")
        audio_path = f"data/audio_files/output.{audio_format}"  # read back the file that was just written
        with open(audio_path, 'rb') as audio_file:
            pygame.mixer.music.load(audio_file)
            pygame.mixer.music.play()
            while pygame.mixer.music.get_busy():
                pygame.time.Clock().tick(10)
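A round-trip with STTModule could look like the following sketch. The speech.yml file name is hypothetical; if provided, it is read with read_yaml and should match the SpeechModel schema above (stt.energy_threshold, stt.pause_threshold, tts.model, tts.voice, tts.speed). A working microphone and an OPENAI_API_KEY in the environment are assumed, since the module instantiates OpenAI() at import time.

from user_sim.handlers.asr_module import STTModule

speech = STTModule("speech.yml")   # or STTModule(None) to fall back to the built-in defaults

ok, text = speech.hear()           # (False, None) on failure; (True, "Repeat, please.") if unclear
if ok:
    speech.say(f"You said: {text}")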
user_sim/handlers/html_parser_module.py
@@ -0,0 +1,202 @@
import requests
import platform
import shutil
import logging
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from user_sim.handlers.image_recognition_module import image_description
from user_sim.utils import config
from user_sim.utils.register_management import save_register, load_register, hash_generate


logger = logging.getLogger('Info Logger')

wp_register_name = "webpage_register.json"


def is_driver_installed(driver_name):
    return shutil.which(driver_name) is not None


def get_webdriver():
    system = platform.system()

    if system == "Windows":
        print("Using Microsoft Edge on Windows")
        if is_driver_installed("msedgedriver"):
            return webdriver.Edge()
        service = EdgeService(EdgeChromiumDriverManager().install())
        return webdriver.Edge(service=service)

    elif system == "Darwin":  # MacOS
        print("Using Safari on Mac")
        return webdriver.Safari()

    elif system == "Linux":
        print("Using Firefox on Linux")
        if is_driver_installed("geckodriver"):
            return webdriver.Firefox()
        service = FirefoxService(GeckoDriverManager().install())
        return webdriver.Firefox(service=service)

    else:
        print("Unknown system. Trying Chrome...")
        if is_driver_installed("chromedriver"):
            return webdriver.Chrome()
        service = ChromeService(ChromeDriverManager().install())
        return webdriver.Chrome(service=service)


def is_dynamic_page(url):
    headers = {"User-Agent": "Mozilla/5.0"}  # Simulate a real browser
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        return False  # If the page does not respond, assume it's not JS

    # Check content type
    content_type = response.headers.get("Content-Type", "")
    if "text/html" not in content_type:
        logger.warning(f"Expected HTML, but got {content_type}")
        return False

    try:
        soup = BeautifulSoup(response.text, "html.parser")
        # Check if there is little visible content
        text = soup.get_text(strip=True)

        if len(text) < 200:  # Adjust threshold based on page type
            return True  # Probably uses JavaScript

    except Exception as e:
        logger.error(f"Error parsing HTML: {e}")
        # todo: add error code
        return False

    return False  # The page has enough static content


def uses_ajax(url):
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})

    if response.status_code != 200:
        return False

    return "fetch" in response.text or "XMLHttpRequest" in response.text


def detect_scraping_method(url):
    if is_dynamic_page(url) or uses_ajax(url):
        return "selenium"
    return "requests"


def describe_images_in_webpage(url, soup):
    image_descriptions = []
    for img_index, img in enumerate(soup.find_all("img")):
        src = img.get("src")
        if src:
            full_url = urljoin(url, src)
            description = image_description(full_url, detailed=False)
            image_descriptions.append(f"Image description {img_index}: {description}")

    image_text = " ".join(image_descriptions)
    return image_text


def webpage_reader(url):

    if config.ignore_cache:
        register = {}
        logger.info("Cache will be ignored.")
    else:
        register = load_register(wp_register_name)

    wp_hash = hash_generate(content=url)

    def process_html(url):
        method = detect_scraping_method(url)

        if method == "selenium":
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            driver = get_webdriver()

            driver.get(url)
            page_source = driver.page_source
            driver.quit()

            try:
                soup = BeautifulSoup(page_source, "lxml")
            except Exception as e:
                logger.error(f"Error parsing JavaScript-rendered HTML: {e}")
                return None

        else:
            headers = {"User-Agent": "Mozilla/5.0"}
            response = requests.get(url, headers=headers)

            if response.status_code != 200:
                logger.error(f"Error accessing the page: {response.status_code}")
                return None

            # Ensure proper encoding
            response.encoding = response.apparent_encoding

            # Check if the response is actually HTML
            content_type = response.headers.get("Content-Type", "")
            if "text/html" not in content_type:
                logger.warning(f"Expected HTML, but got {content_type}")
                return None

            try:
                soup = BeautifulSoup(response.text, "lxml")  # More robust parser
            except Exception as e:
                logger.error(f"Error parsing HTML: {e}")
                return None

        # Remove unnecessary elements
        for script in soup(["script", "style", "header", "footer", "nav", "aside"]):
            script.extract()

        text = soup.get_text(separator=" ", strip=True)
        images = describe_images_in_webpage(url, soup)
        description = f"(Web page content: {text + images} >>)"
        return description

    if wp_hash in register:
        if config.update_cache:
            output_text = process_html(url)
            if output_text is None:
                logger.error("Cache couldn't be updated due to web page error.")
                return "(Web page content: web page couldn't be loaded.)"
            register[wp_hash] = output_text
            logger.info("Cache updated!")
        output_text = register[wp_hash]
        logger.info("Retrieved information from cache.")
    else:
        output_text = process_html(url)
        if output_text:
            register[wp_hash] = output_text
        else:
            output_text = "(Web page content: web page couldn't be loaded.)"

    if config.ignore_cache:
        logger.info("Web page cache was ignored.")
    else:
        save_register(register, wp_register_name)
        logger.info("Web page cache was saved!")

    logger.info(output_text)
    return output_text
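The module's entry point is webpage_reader(url), which caches page summaries in webpage_register.json through register_management. A call could look like the sketch below; it assumes the ignore_cache and update_cache flags in user_sim.utils.config can simply be set before the call, which is how the function reads them above.

from user_sim.utils import config
from user_sim.handlers.html_parser_module import webpage_reader

config.ignore_cache = False   # reuse cached pages when available
config.update_cache = False   # do not refresh existing cache entries

summary = webpage_reader("https://example.com/docs")
print(summary)                # "(Web page content: ... >>)" or a "couldn't be loaded" placeholder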