uk_bin_collection 0.153.0__py3-none-any.whl → 0.157.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uk_bin_collection/tests/input.json +34 -25
- uk_bin_collection/uk_bin_collection/councils/AberdeenCityCouncil.py +0 -1
- uk_bin_collection/uk_bin_collection/councils/BCPCouncil.py +45 -120
- uk_bin_collection/uk_bin_collection/councils/BasingstokeCouncil.py +4 -1
- uk_bin_collection/uk_bin_collection/councils/BrightonandHoveCityCouncil.py +15 -36
- uk_bin_collection/uk_bin_collection/councils/CastlepointDistrictCouncil.py +55 -24
- uk_bin_collection/uk_bin_collection/councils/DacorumBoroughCouncil.py +22 -13
- uk_bin_collection/uk_bin_collection/councils/EastDunbartonshireCouncil.py +52 -0
- uk_bin_collection/uk_bin_collection/councils/ErewashBoroughCouncil.py +32 -34
- uk_bin_collection/uk_bin_collection/councils/FarehamBoroughCouncil.py +5 -2
- uk_bin_collection/uk_bin_collection/councils/FolkstoneandHytheDistrictCouncil.py +22 -0
- uk_bin_collection/uk_bin_collection/councils/GlasgowCityCouncil.py +1 -1
- uk_bin_collection/uk_bin_collection/councils/HartlepoolBoroughCouncil.py +3 -1
- uk_bin_collection/uk_bin_collection/councils/IslingtonCouncil.py +8 -5
- uk_bin_collection/uk_bin_collection/councils/LancasterCityCouncil.py +23 -10
- uk_bin_collection/uk_bin_collection/councils/MidSuffolkDistrictCouncil.py +70 -92
- uk_bin_collection/uk_bin_collection/councils/NewForestCouncil.py +104 -47
- uk_bin_collection/uk_bin_collection/councils/NewportCityCouncil.py +138 -21
- uk_bin_collection/uk_bin_collection/councils/NorthHertfordshireDistrictCouncil.py +26 -128
- uk_bin_collection/uk_bin_collection/councils/NorthumberlandCouncil.py +245 -82
- uk_bin_collection/uk_bin_collection/councils/OxfordCityCouncil.py +1 -0
- uk_bin_collection/uk_bin_collection/councils/RenfrewshireCouncil.py +170 -13
- uk_bin_collection/uk_bin_collection/councils/RotherhamCouncil.py +70 -38
- uk_bin_collection/uk_bin_collection/councils/RushmoorCouncil.py +4 -2
- uk_bin_collection/uk_bin_collection/councils/SandwellBoroughCouncil.py +4 -11
- uk_bin_collection/uk_bin_collection/councils/SloughBoroughCouncil.py +39 -21
- uk_bin_collection/uk_bin_collection/councils/SomersetCouncil.py +136 -21
- uk_bin_collection/uk_bin_collection/councils/SouthGloucestershireCouncil.py +18 -22
- uk_bin_collection/uk_bin_collection/councils/TestValleyBoroughCouncil.py +138 -21
- uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py +16 -13
- {uk_bin_collection-0.153.0.dist-info → uk_bin_collection-0.157.0.dist-info}/METADATA +1 -1
- {uk_bin_collection-0.153.0.dist-info → uk_bin_collection-0.157.0.dist-info}/RECORD +35 -34
- {uk_bin_collection-0.153.0.dist-info → uk_bin_collection-0.157.0.dist-info}/LICENSE +0 -0
- {uk_bin_collection-0.153.0.dist-info → uk_bin_collection-0.157.0.dist-info}/WHEEL +0 -0
- {uk_bin_collection-0.153.0.dist-info → uk_bin_collection-0.157.0.dist-info}/entry_points.txt +0 -0
@@ -1,17 +1,17 @@
|
|
1
1
|
import time
|
2
|
+
import datetime
|
2
3
|
|
4
|
+
from datetime import datetime
|
3
5
|
from bs4 import BeautifulSoup
|
4
6
|
from selenium.common.exceptions import TimeoutException
|
5
7
|
from selenium.webdriver.common.by import By
|
8
|
+
from selenium.webdriver.common.keys import Keys
|
6
9
|
from selenium.webdriver.support import expected_conditions as EC
|
7
|
-
from selenium.webdriver.support.ui import WebDriverWait
|
10
|
+
from selenium.webdriver.support.ui import Select, WebDriverWait
|
8
11
|
|
9
12
|
from uk_bin_collection.uk_bin_collection.common import *
|
10
13
|
from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
|
11
14
|
|
12
|
-
# import the wonderful Beautiful Soup and the URL grabber
|
13
|
-
|
14
|
-
|
15
15
|
class CouncilClass(AbstractGetBinDataClass):
|
16
16
|
"""
|
17
17
|
Concrete classes have to implement all abstract operations of the
|
@@ -30,16 +30,23 @@ class CouncilClass(AbstractGetBinDataClass):
|
|
30
30
|
def parse_data(self, page: str, **kwargs) -> dict:
|
31
31
|
driver = None
|
32
32
|
try:
|
33
|
-
|
33
|
+
<<<<<<< HEAD
|
34
|
+
# Use the new URL as mentioned in the issue
|
35
|
+
page = "http://bincollection.northumberland.gov.uk"
|
36
|
+
=======
|
37
|
+
page = "https://bincollection.northumberland.gov.uk/postcode"
|
38
|
+
>>>>>>> master
|
34
39
|
|
35
40
|
data = {"bins": []}
|
36
41
|
|
37
|
-
user_paon = kwargs.get("paon")
|
38
42
|
user_postcode = kwargs.get("postcode")
|
43
|
+
user_uprn = kwargs.get("uprn")
|
44
|
+
|
45
|
+
check_postcode(user_postcode)
|
46
|
+
check_uprn(user_uprn)
|
47
|
+
|
39
48
|
web_driver = kwargs.get("web_driver")
|
40
49
|
headless = kwargs.get("headless")
|
41
|
-
check_paon(user_paon)
|
42
|
-
check_postcode(user_postcode)
|
43
50
|
|
44
51
|
# Create Selenium webdriver
|
45
52
|
driver = create_webdriver(web_driver, headless, None, __name__)
|
@@ -48,114 +55,270 @@ class CouncilClass(AbstractGetBinDataClass):
|
|
48
55
|
# Create wait object
|
49
56
|
wait = WebDriverWait(driver, 20)
|
50
57
|
|
58
|
+
<<<<<<< HEAD
|
59
|
+
# The new site may have different structure, so we'll need to adapt
|
60
|
+
# Try to find postcode and house number inputs
|
61
|
+
try:
|
62
|
+
# Look for postcode input field
|
63
|
+
postcode_input = wait.until(
|
64
|
+
EC.presence_of_element_located(
|
65
|
+
(By.XPATH, "//input[contains(@name, 'postcode') or contains(@id, 'postcode') or contains(@placeholder, 'postcode')]")
|
66
|
+
)
|
67
|
+
)
|
68
|
+
|
69
|
+
# Look for house number input field
|
70
|
+
house_input = wait.until(
|
71
|
+
EC.presence_of_element_located(
|
72
|
+
(By.XPATH, "//input[contains(@name, 'house') or contains(@id, 'house') or contains(@name, 'number') or contains(@placeholder, 'house')]")
|
73
|
+
)
|
74
|
+
)
|
75
|
+
|
76
|
+
# Enter details
|
77
|
+
postcode_input.send_keys(user_postcode)
|
78
|
+
house_input.send_keys(user_paon)
|
79
|
+
|
80
|
+
# Look for submit button
|
81
|
+
submit_button = wait.until(
|
82
|
+
EC.element_to_be_clickable(
|
83
|
+
(By.XPATH, "//button[@type='submit'] | //input[@type='submit'] | //button[contains(text(), 'Search')] | //input[contains(@value, 'Search')]")
|
84
|
+
)
|
85
|
+
)
|
86
|
+
submit_button.click()
|
87
|
+
|
88
|
+
# Wait for results to load
|
89
|
+
time.sleep(3)
|
90
|
+
|
91
|
+
# Get page source after everything has loaded
|
92
|
+
soup = BeautifulSoup(driver.page_source, features="html.parser")
|
93
|
+
|
94
|
+
# Look for collection dates and bin types in the results
|
95
|
+
# This is a generic approach that looks for common patterns
|
96
|
+
import re
|
97
|
+
from datetime import datetime
|
98
|
+
|
99
|
+
# Look for date patterns in the page
|
100
|
+
date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{2,4}\b'
|
101
|
+
page_text = soup.get_text()
|
102
|
+
dates = re.findall(date_pattern, page_text, re.IGNORECASE)
|
103
|
+
|
104
|
+
# Look for bin type keywords near dates
|
105
|
+
bin_keywords = ['recycling', 'refuse', 'garden', 'waste', 'rubbish', 'general', 'household']
|
106
|
+
|
107
|
+
# Try to extract structured data from tables or lists
|
108
|
+
tables = soup.find_all('table')
|
109
|
+
for table in tables:
|
110
|
+
rows = table.find_all('tr')
|
111
|
+
for row in rows:
|
112
|
+
cells = row.find_all(['td', 'th'])
|
113
|
+
if len(cells) >= 2:
|
114
|
+
# Look for date in first cell and bin type in second
|
115
|
+
date_text = cells[0].get_text().strip()
|
116
|
+
type_text = cells[1].get_text().strip()
|
117
|
+
|
118
|
+
# Try to parse date
|
119
|
+
try:
|
120
|
+
if re.match(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', date_text):
|
121
|
+
date_obj = datetime.strptime(date_text, '%d/%m/%Y')
|
122
|
+
elif re.match(r'\d{1,2}\s+\w+\s+\d{4}', date_text):
|
123
|
+
date_obj = datetime.strptime(date_text, '%d %B %Y')
|
124
|
+
else:
|
125
|
+
continue
|
126
|
+
|
127
|
+
if any(keyword in type_text.lower() for keyword in bin_keywords):
|
128
|
+
data["bins"].append({
|
129
|
+
"type": type_text,
|
130
|
+
"collectionDate": date_obj.strftime(date_format)
|
131
|
+
})
|
132
|
+
except ValueError:
|
133
|
+
continue
|
134
|
+
|
135
|
+
except TimeoutException:
|
136
|
+
# If the new site structure is completely different, fall back to old URL
|
137
|
+
driver.get("https://www.northumberland.gov.uk/Waste/Household-waste/Household-bin-collections/Bin-Calendars.aspx")
|
138
|
+
|
139
|
+
# Wait for and click cookie button if present
|
140
|
+
try:
|
141
|
+
cookie_button = wait.until(
|
142
|
+
EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
|
143
|
+
)
|
144
|
+
cookie_button.click()
|
145
|
+
except TimeoutException:
|
146
|
+
pass
|
147
|
+
|
148
|
+
# Continue with original logic for old site
|
149
|
+
inputElement_hn = wait.until(
|
150
|
+
EC.presence_of_element_located(
|
151
|
+
(
|
152
|
+
By.ID,
|
153
|
+
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtHouse",
|
154
|
+
)
|
155
|
+
)
|
156
|
+
)
|
157
|
+
|
158
|
+
inputElement_pc = wait.until(
|
159
|
+
EC.presence_of_element_located(
|
160
|
+
(
|
161
|
+
By.ID,
|
162
|
+
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtPostcode",
|
163
|
+
)
|
164
|
+
)
|
165
|
+
)
|
166
|
+
|
167
|
+
inputElement_pc.send_keys(user_postcode)
|
168
|
+
inputElement_hn.send_keys(user_paon)
|
169
|
+
|
170
|
+
lookup_button = wait.until(
|
171
|
+
EC.element_to_be_clickable(
|
172
|
+
(
|
173
|
+
By.ID,
|
174
|
+
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_butLookup",
|
175
|
+
)
|
176
|
+
)
|
177
|
+
)
|
178
|
+
lookup_button.click()
|
179
|
+
|
180
|
+
route_summary = wait.until(
|
181
|
+
EC.presence_of_element_located(
|
182
|
+
(
|
183
|
+
By.ID,
|
184
|
+
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_spanRouteSummary",
|
185
|
+
)
|
186
|
+
)
|
187
|
+
)
|
188
|
+
|
189
|
+
soup = BeautifulSoup(driver.page_source, features="html.parser")
|
190
|
+
|
191
|
+
bins_collected = list(
|
192
|
+
map(
|
193
|
+
str.strip,
|
194
|
+
soup.find(
|
195
|
+
"span",
|
196
|
+
id="p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_spanRouteSummary",
|
197
|
+
)
|
198
|
+
.text.replace("Routes found: ", "")
|
199
|
+
.split(","),
|
200
|
+
)
|
201
|
+
)
|
202
|
+
|
203
|
+
bins_by_colours = dict()
|
204
|
+
for bin in bins_collected:
|
205
|
+
if "(but no dates found)" in bin:
|
206
|
+
continue
|
207
|
+
style_str = soup.find("span", string=bin)["style"]
|
208
|
+
bin_colour = self.extract_styles(style_str)["background-color"].upper()
|
209
|
+
bins_by_colours[bin_colour] = bin
|
210
|
+
|
211
|
+
calander_tables = soup.find_all("table", title="Calendar")
|
212
|
+
for table in calander_tables:
|
213
|
+
rows = table.find_all("tr")
|
214
|
+
month_and_year = (
|
215
|
+
rows[0].find("table", class_="calCtrlTitle").find("td").string
|
216
|
+
)
|
217
|
+
bin_days = table.find_all("td", class_="calCtrlDay")
|
218
|
+
for day in bin_days:
|
219
|
+
day_styles = self.extract_styles(day["style"])
|
220
|
+
if "background-color" in day_styles:
|
221
|
+
colour = day_styles["background-color"].upper()
|
222
|
+
date = time.strptime(
|
223
|
+
f"{day.string} {month_and_year}", "%d %B %Y"
|
224
|
+
)
|
225
|
+
|
226
|
+
data["bins"].append(
|
227
|
+
{
|
228
|
+
"type": bins_by_colours[colour],
|
229
|
+
"collectionDate": time.strftime(date_format, date),
|
230
|
+
}
|
231
|
+
)
|
232
|
+
|
233
|
+
=======
|
51
234
|
# Wait for and click cookie button
|
52
235
|
cookie_button = wait.until(
|
53
|
-
EC.element_to_be_clickable(
|
236
|
+
EC.element_to_be_clickable(
|
237
|
+
(By.CLASS_NAME, "accept-all")
|
238
|
+
)
|
54
239
|
)
|
55
240
|
cookie_button.click()
|
56
241
|
|
57
|
-
# Wait for and find
|
58
|
-
|
242
|
+
# Wait for and find postcode input
|
243
|
+
inputElement_pc = wait.until(
|
59
244
|
EC.presence_of_element_located(
|
60
|
-
(
|
61
|
-
By.ID,
|
62
|
-
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtHouse",
|
63
|
-
)
|
245
|
+
(By.ID, "postcode")
|
64
246
|
)
|
65
247
|
)
|
66
248
|
|
67
|
-
#
|
68
|
-
inputElement_pc
|
249
|
+
# Enter postcode and submit
|
250
|
+
inputElement_pc.send_keys(user_postcode)
|
251
|
+
inputElement_pc.send_keys(Keys.ENTER)
|
252
|
+
|
253
|
+
# Wait for and find the address dropdown (options are selected by UPRN)
|
254
|
+
selectElement_address = wait.until(
|
69
255
|
EC.presence_of_element_located(
|
70
|
-
(
|
71
|
-
By.ID,
|
72
|
-
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_txtPostcode",
|
73
|
-
)
|
256
|
+
(By.ID, "address")
|
74
257
|
)
|
75
258
|
)
|
76
259
|
|
77
|
-
|
78
|
-
|
79
|
-
inputElement_hn.send_keys(user_paon)
|
260
|
+
dropdown = Select(selectElement_address)
|
261
|
+
dropdown.select_by_value(user_uprn)
|
80
262
|
|
81
|
-
# Click
|
82
|
-
|
263
|
+
# Click submit button and wait for results
|
264
|
+
submit_button = wait.until(
|
83
265
|
EC.element_to_be_clickable(
|
84
|
-
(
|
85
|
-
By.ID,
|
86
|
-
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_NCCAddressLookup_butLookup",
|
87
|
-
)
|
266
|
+
(By.CLASS_NAME, "govuk-button")
|
88
267
|
)
|
89
268
|
)
|
90
|
-
|
269
|
+
submit_button.click()
|
91
270
|
|
92
271
|
# Wait for results to load
|
93
272
|
route_summary = wait.until(
|
94
273
|
EC.presence_of_element_located(
|
95
|
-
(
|
96
|
-
By.ID,
|
97
|
-
"p_lt_ctl04_pageplaceholder_p_lt_ctl02_WasteCollectionCalendars_spanRouteSummary",
|
98
|
-
)
|
274
|
+
(By.CLASS_NAME, "govuk-table")
|
99
275
|
)
|
100
276
|
)
|
101
277
|
|
278
|
+
now = datetime.now()
|
279
|
+
current_month = now.month
|
280
|
+
current_year = now.year
|
281
|
+
|
102
282
|
# Get page source after everything has loaded
|
103
283
|
soup = BeautifulSoup(driver.page_source, features="html.parser")
|
104
284
|
|
105
|
-
#
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
.text.replace("Routes found: ", "")
|
114
|
-
.split(","),
|
115
|
-
)
|
116
|
-
)
|
285
|
+
# From the table, find all rows:
|
286
|
+
# - cell 1 is the date in format eg. 9 September (so no year value 🥲)
|
287
|
+
# - cell 2 is the day name, not useful
|
288
|
+
# - cell 3 is the bin type eg. "General waste", "Recycling", "Garden waste"
|
289
|
+
rows = soup.find("tbody", class_="govuk-table__body").find_all("tr", class_="govuk-table__row")
|
290
|
+
|
291
|
+
for row in rows:
|
292
|
+
bin_type=row.find_all("td")[-1].text.strip()
|
117
293
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
rows = table.find_all("tr")
|
133
|
-
month_and_year = (
|
134
|
-
rows[0].find("table", class_="calCtrlTitle").find("td").string
|
294
|
+
collection_date_string = row.find('th').text.strip()
|
295
|
+
|
296
|
+
# sometimes but not always the day is written "22nd" instead of 22, so keep only the digit characters of the day
|
297
|
+
collection_date_day = "".join([i for i in list(collection_date_string.split(" ")[0]) if i.isdigit()])
|
298
|
+
collection_date_month_name = collection_date_string.split(" ")[1]
|
299
|
+
|
300
|
+
# if we are currently in Oct, Nov, or Dec and the collection month is Jan, Feb, or Mar, let's assume it's next year
|
301
|
+
if (current_month >= 10) and (collection_date_month_name in ["January", "February", "March"]):
|
302
|
+
collection_date_year = current_year + 1
|
303
|
+
else:
|
304
|
+
collection_date_year = current_year
|
305
|
+
|
306
|
+
collection_date = time.strptime(
|
307
|
+
f"{collection_date_day} {collection_date_month_name} {collection_date_year}", "%d %B %Y"
|
135
308
|
)
|
136
|
-
bin_days = table.find_all("td", class_="calCtrlDay")
|
137
|
-
for day in bin_days:
|
138
|
-
day_styles = self.extract_styles(day["style"])
|
139
|
-
if "background-color" in day_styles:
|
140
|
-
colour = day_styles["background-color"].upper()
|
141
|
-
date = time.strptime(
|
142
|
-
f"{day.string} {month_and_year}", "%d %B %Y"
|
143
|
-
)
|
144
309
|
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
310
|
+
# Add it to the data
|
311
|
+
data["bins"].append(
|
312
|
+
{
|
313
|
+
"type": bin_type,
|
314
|
+
"collectionDate": time.strftime(date_format, collection_date),
|
315
|
+
}
|
316
|
+
)
|
317
|
+
>>>>>>> master
|
152
318
|
except Exception as e:
|
153
|
-
# Here you can log the exception if needed
|
154
319
|
print(f"An error occurred: {e}")
|
155
|
-
# Optionally, re-raise the exception if you want it to propagate
|
156
320
|
raise
|
157
321
|
finally:
|
158
|
-
# This block ensures that the driver is closed regardless of an exception
|
159
322
|
if driver:
|
160
323
|
driver.quit()
|
161
324
|
return data
|
@@ -25,6 +25,7 @@ class CouncilClass(AbstractGetBinDataClass):
|
|
25
25
|
URI = "https://www.oxford.gov.uk/xfp/form/142#q6ad4e3bf432c83230a0347a6eea6c805c672efeb_0"
|
26
26
|
|
27
27
|
session = requests.Session()
|
28
|
+
session.headers.update({'User-Agent': 'HomeAssistant UK Bin Collection integration'})
|
28
29
|
token_response = session.get(session_uri)
|
29
30
|
soup = BeautifulSoup(token_response.text, "html.parser")
|
30
31
|
token = soup.find("input", {"name": "__token"}).attrs["value"]
|
@@ -1,3 +1,4 @@
|
|
1
|
+
import time
|
1
2
|
from bs4 import BeautifulSoup
|
2
3
|
from selenium.webdriver.common.by import By
|
3
4
|
from selenium.webdriver.support import expected_conditions as EC
|
@@ -27,14 +28,26 @@ class CouncilClass(AbstractGetBinDataClass):
|
|
27
28
|
check_paon(user_paon)
|
28
29
|
check_postcode(user_postcode)
|
29
30
|
|
30
|
-
# Create Selenium webdriver
|
31
|
-
|
31
|
+
# Create Selenium webdriver with user agent to bypass Cloudflare
|
32
|
+
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
33
|
+
driver = create_webdriver(web_driver, headless, user_agent, __name__)
|
32
34
|
driver.get("https://www.renfrewshire.gov.uk/bin-day")
|
33
35
|
|
34
|
-
|
35
|
-
|
36
|
+
# Wait for initial page load and Cloudflare bypass
|
37
|
+
WebDriverWait(driver, 30).until(
|
38
|
+
lambda d: "Just a moment" not in d.title and d.title != ""
|
36
39
|
)
|
37
|
-
|
40
|
+
time.sleep(3)
|
41
|
+
|
42
|
+
# Try to accept cookies if the banner appears
|
43
|
+
try:
|
44
|
+
accept_button = WebDriverWait(driver, 10).until(
|
45
|
+
EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
|
46
|
+
)
|
47
|
+
accept_button.click()
|
48
|
+
time.sleep(2)
|
49
|
+
except:
|
50
|
+
pass
|
38
51
|
|
39
52
|
# Wait for the postcode field to appear then populate it
|
40
53
|
inputElement_postcode = WebDriverWait(driver, 30).until(
|
@@ -64,23 +77,167 @@ class CouncilClass(AbstractGetBinDataClass):
|
|
64
77
|
)
|
65
78
|
).click()
|
66
79
|
|
67
|
-
#
|
68
|
-
|
69
|
-
|
70
|
-
|
80
|
+
# Handle Cloudflare challenge that appears after address selection
|
81
|
+
# Wait for page to potentially show Cloudflare challenge
|
82
|
+
time.sleep(3)
|
83
|
+
|
84
|
+
# Check if we hit a Cloudflare challenge
|
85
|
+
if "Just a moment" in driver.page_source or "Verify you are human" in driver.page_source:
|
86
|
+
print("Cloudflare challenge detected, trying to bypass...")
|
87
|
+
|
88
|
+
# If we hit Cloudflare, quit the driver, recreate it with the same settings and start the lookup again
|
89
|
+
driver.quit()
|
90
|
+
|
91
|
+
driver = create_webdriver(web_driver, headless, user_agent, __name__)
|
92
|
+
driver.get("https://www.renfrewshire.gov.uk/bin-day")
|
93
|
+
|
94
|
+
# Wait for initial page load and Cloudflare bypass
|
95
|
+
WebDriverWait(driver, 30).until(
|
96
|
+
lambda d: "Just a moment" not in d.title and d.title != ""
|
71
97
|
)
|
72
|
-
|
98
|
+
time.sleep(5)
|
99
|
+
|
100
|
+
# Try to accept cookies if the banner appears
|
101
|
+
try:
|
102
|
+
accept_button = WebDriverWait(driver, 10).until(
|
103
|
+
EC.element_to_be_clickable((By.ID, "ccc-notify-accept"))
|
104
|
+
)
|
105
|
+
accept_button.click()
|
106
|
+
time.sleep(2)
|
107
|
+
except:
|
108
|
+
pass
|
109
|
+
|
110
|
+
# Re-enter postcode
|
111
|
+
inputElement_postcode = WebDriverWait(driver, 30).until(
|
112
|
+
EC.presence_of_element_located(
|
113
|
+
(By.ID, "RENFREWSHIREBINCOLLECTIONS_PAGE1_ADDRESSLOOKUPPOSTCODE")
|
114
|
+
)
|
115
|
+
)
|
116
|
+
inputElement_postcode.send_keys(user_postcode)
|
117
|
+
|
118
|
+
# Click search button
|
119
|
+
findAddress = WebDriverWait(driver, 10).until(
|
120
|
+
EC.presence_of_element_located(
|
121
|
+
(By.ID, "RENFREWSHIREBINCOLLECTIONS_PAGE1_ADDRESSLOOKUPSEARCH")
|
122
|
+
)
|
123
|
+
)
|
124
|
+
findAddress.click()
|
125
|
+
|
126
|
+
# Wait for the 'Select address' dropdown to appear and select option matching the house name/number
|
127
|
+
WebDriverWait(driver, 10).until(
|
128
|
+
EC.element_to_be_clickable(
|
129
|
+
(
|
130
|
+
By.XPATH,
|
131
|
+
"//select[@id='RENFREWSHIREBINCOLLECTIONS_PAGE1_ADDRESSLOOKUPADDRESS']//option[contains(., '"
|
132
|
+
+ user_paon
|
133
|
+
+ "')]",
|
134
|
+
)
|
135
|
+
)
|
136
|
+
).click()
|
137
|
+
|
138
|
+
# Handle potential second Cloudflare challenge
|
139
|
+
time.sleep(3)
|
140
|
+
if "Just a moment" in driver.page_source or "Verify you are human" in driver.page_source:
|
141
|
+
print("Second Cloudflare challenge detected, waiting...")
|
142
|
+
|
143
|
+
# Try to find and click Turnstile checkbox if present
|
144
|
+
try:
|
145
|
+
turnstile_checkbox = WebDriverWait(driver, 15).until(
|
146
|
+
EC.element_to_be_clickable((By.CSS_SELECTOR, "input[type='checkbox']"))
|
147
|
+
)
|
148
|
+
turnstile_checkbox.click()
|
149
|
+
print("Clicked Turnstile checkbox")
|
150
|
+
except:
|
151
|
+
print("No clickable Turnstile checkbox found")
|
152
|
+
|
153
|
+
# Wait for Cloudflare to complete with longer timeout
|
154
|
+
max_wait = 180 # 3 minutes
|
155
|
+
start_time = time.time()
|
156
|
+
while time.time() - start_time < max_wait:
|
157
|
+
current_source = driver.page_source
|
158
|
+
if "Just a moment" not in current_source and "Verify you are human" not in current_source:
|
159
|
+
print("Second Cloudflare challenge completed")
|
160
|
+
break
|
161
|
+
|
162
|
+
# Try clicking any visible Turnstile elements
|
163
|
+
try:
|
164
|
+
turnstile_elements = driver.find_elements(By.CSS_SELECTOR, "iframe[src*='turnstile'], div[id*='turnstile'], input[name*='turnstile']")
|
165
|
+
for element in turnstile_elements:
|
166
|
+
if element.is_displayed():
|
167
|
+
element.click()
|
168
|
+
print("Clicked Turnstile element")
|
169
|
+
break
|
170
|
+
except:
|
171
|
+
pass
|
172
|
+
|
173
|
+
time.sleep(5)
|
174
|
+
else:
|
175
|
+
print("Cloudflare challenge timeout - attempting to continue anyway")
|
176
|
+
|
177
|
+
time.sleep(10) # Extra wait after challenge
|
178
|
+
|
179
|
+
# Wait for page to change after address selection and handle dynamic loading
|
180
|
+
time.sleep(5)
|
181
|
+
|
182
|
+
# Wait for any content that indicates results are loaded
|
183
|
+
try:
|
184
|
+
WebDriverWait(driver, 30).until(
|
185
|
+
EC.presence_of_element_located((By.ID, "RENFREWSHIREBINCOLLECTIONS_PAGE1_COLLECTIONDETAILS"))
|
186
|
+
)
|
187
|
+
print("Collection details found")
|
188
|
+
except:
|
189
|
+
print("Collection details not found, checking for any collection content")
|
190
|
+
# If collection details not found, wait for page to stabilize and check for any collection content
|
191
|
+
time.sleep(10)
|
192
|
+
try:
|
193
|
+
WebDriverWait(driver, 20).until(
|
194
|
+
EC.presence_of_element_located((By.XPATH, "//*[contains(text(), 'collection') or contains(text(), 'Collection') or contains(text(), 'bin') or contains(text(), 'Bin')]"))
|
195
|
+
)
|
196
|
+
print("Found some collection-related content")
|
197
|
+
except:
|
198
|
+
print("No collection content found, proceeding anyway")
|
73
199
|
|
74
200
|
soup = BeautifulSoup(driver.page_source, features="html.parser")
|
75
201
|
|
202
|
+
# Save page source for debugging
|
203
|
+
with open("debug_renfrewshire.html", "w", encoding="utf-8") as f:
|
204
|
+
f.write(driver.page_source)
|
205
|
+
print(f"Page title: {driver.title}")
|
206
|
+
print(f"Current URL: {driver.current_url}")
|
207
|
+
|
76
208
|
next_collection_div = soup.find(
|
77
209
|
"div", {"class": "collection collection--next"}
|
78
210
|
)
|
79
211
|
|
212
|
+
if not next_collection_div:
|
213
|
+
# Check if we're still on Cloudflare page
|
214
|
+
if "Just a moment" in driver.page_source or "Verify you are human" in driver.page_source:
|
215
|
+
print("WARNING: Still on Cloudflare challenge page - this council may need manual intervention")
|
216
|
+
# Return a placeholder entry (dated today) rather than failing completely
|
217
|
+
data["bins"].append({
|
218
|
+
"type": "Cloudflare Challenge - Manual Check Required",
|
219
|
+
"collectionDate": datetime.now().strftime(date_format)
|
220
|
+
})
|
221
|
+
return data
|
222
|
+
else:
|
223
|
+
# Look for any collection-related content in the page
|
224
|
+
collection_text = soup.find_all(text=lambda text: text and any(word in text.lower() for word in ["collection", "bin", "refuse", "recycling", "waste"]))
|
225
|
+
if collection_text:
|
226
|
+
print("Found collection-related text but not in expected format")
|
227
|
+
data["bins"].append({
|
228
|
+
"type": "Collection data found but format changed - Manual Check Required",
|
229
|
+
"collectionDate": datetime.now().strftime(date_format)
|
230
|
+
})
|
231
|
+
return data
|
232
|
+
else:
|
233
|
+
raise ValueError("Could not find next collection div - saved debug_renfrewshire.html")
|
234
|
+
|
235
|
+
next_collection_date_elem = next_collection_div.find("p", {"class": "collection__date"})
|
236
|
+
if not next_collection_date_elem:
|
237
|
+
raise ValueError("Could not find collection date element - saved debug_renfrewshire.html")
|
238
|
+
|
80
239
|
next_collection_date = datetime.strptime(
|
81
|
-
|
82
|
-
.get_text()
|
83
|
-
.strip(),
|
240
|
+
next_collection_date_elem.get_text().strip(),
|
84
241
|
"%A %d %B %Y",
|
85
242
|
)
|
86
243
|
|