edb-noumea 0.3.5__tar.gz → 0.3.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/PKG-INFO +2 -1
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/edb_noumea/details.py +73 -30
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/edb_noumea.egg-info/PKG-INFO +2 -1
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/edb_noumea.egg-info/requires.txt +1 -0
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/pyproject.toml +2 -3
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/LICENSE +0 -0
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/README.md +0 -0
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/edb_noumea/__init__.py +0 -0
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/edb_noumea/main.py +0 -0
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/edb_noumea.egg-info/SOURCES.txt +0 -0
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/edb_noumea.egg-info/dependency_links.txt +0 -0
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/edb_noumea.egg-info/top_level.txt +0 -0
- {edb_noumea-0.3.5 → edb_noumea-0.3.7}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: edb-noumea
|
|
3
|
-
Version: 0.3.5
|
|
3
|
+
Version: 0.3.7
|
|
4
4
|
Summary: Scraper pour la qualité des eaux de baignade à Nouméa.
|
|
5
5
|
Project-URL: Homepage, https://github.com/adriens/edb-noumea
|
|
6
6
|
Project-URL: Repository, https://github.com/adriens/edb-noumea
|
|
@@ -11,6 +11,7 @@ Requires-Dist: beautifulsoup4
|
|
|
11
11
|
Requires-Dist: pandas
|
|
12
12
|
Requires-Dist: lxml
|
|
13
13
|
Requires-Dist: pdfplumber
|
|
14
|
+
Requires-Dist: camelot-py[cv]
|
|
14
15
|
Requires-Dist: matplotlib
|
|
15
16
|
Dynamic: license-file
|
|
16
17
|
|
|
@@ -1,4 +1,9 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
|
+
import requests
|
|
3
|
+
from bs4 import BeautifulSoup
|
|
4
|
+
import camelot
|
|
5
|
+
import tempfile
|
|
6
|
+
import os
|
|
2
7
|
|
|
3
8
|
@staticmethod
|
|
4
9
|
def get_sites():
|
|
@@ -14,22 +19,16 @@ def get_sites():
|
|
|
14
19
|
{"site": "PLAGE DU KUENDU BEACH", "plage": "Plage du Kuendu Beach", "gmaps_url": "https://maps.app.goo.gl/oGY6Hy4KCXJWxqfL9"},
|
|
15
20
|
]
|
|
16
21
|
return pd.DataFrame(data)
|
|
22
|
+
|
|
17
23
|
def get_pdf_url():
|
|
18
24
|
"""
|
|
19
25
|
Alias public pour obtenir l'URL du dernier PDF d'analyses détaillées.
|
|
20
26
|
"""
|
|
21
27
|
return get_latest_pdf_url()
|
|
22
28
|
|
|
23
|
-
import pandas as pd
|
|
24
|
-
import pdfplumber
|
|
25
|
-
import requests
|
|
26
|
-
import io
|
|
27
|
-
from bs4 import BeautifulSoup
|
|
28
|
-
|
|
29
29
|
# URL de la page officielle contenant le lien vers le PDF
|
|
30
30
|
PAGE_URL = "https://www.noumea.nc/noumea-pratique/salubrite-publique/qualite-eaux-baignade"
|
|
31
31
|
|
|
32
|
-
|
|
33
32
|
def get_latest_pdf_url():
|
|
34
33
|
"""
|
|
35
34
|
Récupère dynamiquement l'URL du dernier PDF d'analyses détaillées depuis la page officielle.
|
|
@@ -58,7 +57,7 @@ def get_latest_pdf_url():
|
|
|
58
57
|
def get_detailed_results():
|
|
59
58
|
"""
|
|
60
59
|
Télécharge dynamiquement le PDF des résultats détaillés, en extrait le premier tableau
|
|
61
|
-
et le retourne sous forme de DataFrame pandas.
|
|
60
|
+
avec Camelot et le retourne sous forme de DataFrame pandas.
|
|
62
61
|
"""
|
|
63
62
|
pdf_url = get_latest_pdf_url()
|
|
64
63
|
if not pdf_url:
|
|
@@ -73,31 +72,75 @@ def get_detailed_results():
|
|
|
73
72
|
print(f"❌ Erreur lors du téléchargement du fichier PDF : {e}")
|
|
74
73
|
return None
|
|
75
74
|
|
|
76
|
-
|
|
75
|
+
# Utiliser un fichier temporaire pour que Camelot puisse le lire
|
|
76
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
|
|
77
|
+
temp_pdf.write(response.content)
|
|
78
|
+
temp_pdf_path = temp_pdf.name
|
|
77
79
|
|
|
78
80
|
try:
|
|
79
|
-
print("🔍 Extraction des tableaux du PDF avec
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
first_page = pdf.pages[0]
|
|
86
|
-
tables = first_page.extract_tables()
|
|
87
|
-
|
|
88
|
-
if not tables:
|
|
89
|
-
print("❌ Aucun tableau n'a été trouvé dans le PDF.")
|
|
90
|
-
return None
|
|
91
|
-
|
|
92
|
-
print(f"✅ {len(tables)} tableau(x) trouvé(s) sur la première page.")
|
|
93
|
-
# Convertir le premier tableau en DataFrame
|
|
94
|
-
table_data = tables[0]
|
|
95
|
-
df = pd.DataFrame(table_data[1:], columns=table_data[0])
|
|
81
|
+
print("🔍 Extraction des tableaux du PDF avec Camelot (flavor='stream')...")
|
|
82
|
+
tables = camelot.read_pdf(temp_pdf_path, flavor='stream', pages='1')
|
|
83
|
+
|
|
84
|
+
if not tables:
|
|
85
|
+
print("❌ Aucun tableau n'a été trouvé dans le PDF avec Camelot.")
|
|
86
|
+
return None
|
|
96
87
|
|
|
88
|
+
print(f"✅ {len(tables)} tableau(x) trouvé(s) sur la première page.")
|
|
89
|
+
# Le DataFrame est directement accessible avec .df
|
|
90
|
+
df = tables[0].df
|
|
91
|
+
|
|
92
|
+
# The header is messy. Let's explicitly define the columns we expect.
|
|
93
|
+
new_columns = [
|
|
94
|
+
"Nom du site de baignade",
|
|
95
|
+
"Point de prélèvement",
|
|
96
|
+
"Date du prélèvement",
|
|
97
|
+
"Heure du prélèvement",
|
|
98
|
+
"Escherichia coli (NPP/100ml)",
|
|
99
|
+
"Entérocoques intestinaux (NPP/100ml)"
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
# The data from camelot has 6 columns, which matches our expected columns.
|
|
103
|
+
# Let's assign these names.
|
|
104
|
+
df.columns = new_columns
|
|
105
|
+
|
|
106
|
+
# Now, we need to find where the actual data starts.
|
|
107
|
+
# It seems to start at index 6 in the camelot df.
|
|
108
|
+
df = df.iloc[6:].reset_index(drop=True)
|
|
109
|
+
|
|
110
|
+
# Apply ffill to all relevant columns after initial column assignment and before specific cleaning
|
|
111
|
+
# Replace empty strings with pd.NA first to ensure ffill works
|
|
112
|
+
for col in df.columns:
|
|
113
|
+
df[col] = df[col].replace('', pd.NA)
|
|
114
|
+
|
|
115
|
+
# Now, forward-fill all columns. This assumes that empty cells in a data block should inherit from the previous non-empty cell.
|
|
116
|
+
df = df.ffill()
|
|
117
|
+
|
|
118
|
+
# Remove trailing informational rows
|
|
119
|
+
# Identify the patterns of the rows to be removed
|
|
120
|
+
unwanted_patterns = [
|
|
121
|
+
"La qualité des eaux de baignade est évaluée au",
|
|
122
|
+
"de présence de germes pathogènes",
|
|
123
|
+
"Suivant l'arrêté N°", # Added to catch more informational rows
|
|
124
|
+
"Indicateurs microbiologiques" # Added to catch more informational rows
|
|
125
|
+
]
|
|
126
|
+
|
|
127
|
+
# Filter out rows that contain these patterns in the 'Nom du site de baignade' or 'Point de prélèvement' column
|
|
128
|
+
# We check both columns as the content might shift
|
|
129
|
+
initial_row_count = df.shape[0]
|
|
130
|
+
df = df[~df['Nom du site de baignade'].fillna('').astype(str).str.contains('|'.join(unwanted_patterns), case=False, na=False)]
|
|
131
|
+
df = df[~df['Point de prélèvement'].fillna('').astype(str).str.contains('|'.join(unwanted_patterns), case=False, na=False)]
|
|
132
|
+
|
|
133
|
+
if df.shape[0] < initial_row_count:
|
|
134
|
+
print(f"Removed {initial_row_count - df.shape[0]} informational rows.")
|
|
135
|
+
|
|
97
136
|
except Exception as e:
|
|
98
|
-
print(f"❌ Une erreur est survenue lors de l'extraction des données du PDF.")
|
|
137
|
+
print(f"❌ Une erreur est survenue lors de l'extraction des données du PDF avec Camelot.")
|
|
99
138
|
print(f" Erreur originale : {e}")
|
|
100
139
|
return None
|
|
140
|
+
finally:
|
|
141
|
+
# Nettoyer le fichier temporaire
|
|
142
|
+
if 'temp_pdf_path' in locals() and os.path.exists(temp_pdf_path):
|
|
143
|
+
os.remove(temp_pdf_path)
|
|
101
144
|
|
|
102
145
|
print("\n--- Aperçu du tableau extrait (toutes colonnes) ---")
|
|
103
146
|
with pd.option_context('display.max_columns', None):
|
|
@@ -154,11 +197,11 @@ def get_detailed_results():
|
|
|
154
197
|
|
|
155
198
|
# Nettoyer et convertir les colonnes e_coli_npp_100ml et enterocoques_npp_100ml
|
|
156
199
|
if "e_coli_npp_100ml" in cleaned_df.columns:
|
|
157
|
-
cleaned_df["e_coli_npp_100ml"] = cleaned_df["e_coli_npp_100ml"].astype(str).str.replace(r"<\s*10", "10", regex=True)
|
|
200
|
+
cleaned_df["e_coli_npp_100ml"] = cleaned_df["e_coli_npp_100ml"].astype(str).str.replace(" ", "", regex=False).str.replace(r"<\s*10", "10", regex=True)
|
|
158
201
|
cleaned_df["e_coli_npp_100ml"] = pd.to_numeric(cleaned_df["e_coli_npp_100ml"], errors="coerce").astype('Int64')
|
|
159
202
|
|
|
160
203
|
if "enterocoques_npp_100ml" in cleaned_df.columns:
|
|
161
|
-
cleaned_df["enterocoques_npp_100ml"] = cleaned_df["enterocoques_npp_100ml"].astype(str).str.replace(r"<\s*10", "10", regex=True)
|
|
204
|
+
cleaned_df["enterocoques_npp_100ml"] = cleaned_df["enterocoques_npp_100ml"].astype(str).str.replace(" ", "", regex=False).str.replace(r"<\s*10", "10", regex=True)
|
|
162
205
|
cleaned_df["enterocoques_npp_100ml"] = pd.to_numeric(cleaned_df["enterocoques_npp_100ml"], errors="coerce").astype('Int64')
|
|
163
206
|
|
|
164
207
|
# Convertir la colonne 'date' en datetime (format jour/mois/année)
|
|
@@ -182,4 +225,4 @@ if __name__ == "__main__":
|
|
|
182
225
|
]])
|
|
183
226
|
# Export CSV
|
|
184
227
|
detailed_df.to_csv("details_dernier_releve.csv", index=False)
|
|
185
|
-
print("\n✅ Export CSV : details_dernier_releve.csv")
|
|
228
|
+
print("\n✅ Export CSV : details_dernier_releve.csv")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: edb-noumea
|
|
3
|
-
Version: 0.3.5
|
|
3
|
+
Version: 0.3.7
|
|
4
4
|
Summary: Scraper pour la qualité des eaux de baignade à Nouméa.
|
|
5
5
|
Project-URL: Homepage, https://github.com/adriens/edb-noumea
|
|
6
6
|
Project-URL: Repository, https://github.com/adriens/edb-noumea
|
|
@@ -11,6 +11,7 @@ Requires-Dist: beautifulsoup4
|
|
|
11
11
|
Requires-Dist: pandas
|
|
12
12
|
Requires-Dist: lxml
|
|
13
13
|
Requires-Dist: pdfplumber
|
|
14
|
+
Requires-Dist: camelot-py[cv]
|
|
14
15
|
Requires-Dist: matplotlib
|
|
15
16
|
Dynamic: license-file
|
|
16
17
|
|
|
@@ -2,15 +2,14 @@
|
|
|
2
2
|
name = "edb-noumea"
|
|
3
3
|
description = "Scraper pour la qualité des eaux de baignade à Nouméa."
|
|
4
4
|
readme = "README.md"
|
|
5
|
-
version = "0.3.5"
|
|
5
|
+
version = "0.3.7"
|
|
6
6
|
dependencies = [
|
|
7
7
|
"requests",
|
|
8
8
|
"beautifulsoup4",
|
|
9
9
|
"pandas",
|
|
10
10
|
"lxml",
|
|
11
11
|
"pdfplumber",
|
|
12
|
-
|
|
13
|
-
|
|
12
|
+
"camelot-py[cv]",
|
|
14
13
|
"matplotlib",
|
|
15
14
|
]
|
|
16
15
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|