edb-noumea 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edb_noumea/details.py +28 -2
- {edb_noumea-0.3.6.dist-info → edb_noumea-0.3.7.dist-info}/METADATA +1 -1
- edb_noumea-0.3.7.dist-info/RECORD +8 -0
- edb_noumea-0.3.6.dist-info/RECORD +0 -8
- {edb_noumea-0.3.6.dist-info → edb_noumea-0.3.7.dist-info}/WHEEL +0 -0
- {edb_noumea-0.3.6.dist-info → edb_noumea-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {edb_noumea-0.3.6.dist-info → edb_noumea-0.3.7.dist-info}/top_level.txt +0 -0
edb_noumea/details.py
CHANGED
|
@@ -107,6 +107,32 @@ def get_detailed_results():
|
|
|
107
107
|
# It seems to start at index 6 in the camelot df.
|
|
108
108
|
df = df.iloc[6:].reset_index(drop=True)
|
|
109
109
|
|
|
110
|
+
# Apply ffill to all relevant columns after initial column assignment and before specific cleaning
|
|
111
|
+
# Replace empty strings with pd.NA first to ensure ffill works
|
|
112
|
+
for col in df.columns:
|
|
113
|
+
df[col] = df[col].replace('', pd.NA)
|
|
114
|
+
|
|
115
|
+
# Now, forward-fill all columns. This assumes that empty cells in a data block should inherit from the previous non-empty cell.
|
|
116
|
+
df = df.ffill()
|
|
117
|
+
|
|
118
|
+
# Remove trailing informational rows
|
|
119
|
+
# Identify the patterns of the rows to be removed
|
|
120
|
+
unwanted_patterns = [
|
|
121
|
+
"La qualité des eaux de baignade est évaluée au",
|
|
122
|
+
"de présence de germes pathogènes",
|
|
123
|
+
"Suivant l'arrêté N°", # Added to catch more informational rows
|
|
124
|
+
"Indicateurs microbiologiques" # Added to catch more informational rows
|
|
125
|
+
]
|
|
126
|
+
|
|
127
|
+
# Filter out rows that contain these patterns in the 'Nom du site de baignade' or 'Point de prélèvement' column
|
|
128
|
+
# We check both columns as the content might shift
|
|
129
|
+
initial_row_count = df.shape[0]
|
|
130
|
+
df = df[~df['Nom du site de baignade'].fillna('').astype(str).str.contains('|'.join(unwanted_patterns), case=False, na=False)]
|
|
131
|
+
df = df[~df['Point de prélèvement'].fillna('').astype(str).str.contains('|'.join(unwanted_patterns), case=False, na=False)]
|
|
132
|
+
|
|
133
|
+
if df.shape[0] < initial_row_count:
|
|
134
|
+
print(f"Removed {initial_row_count - df.shape[0]} informational rows.")
|
|
135
|
+
|
|
110
136
|
except Exception as e:
|
|
111
137
|
print(f"❌ Une erreur est survenue lors de l'extraction des données du PDF avec Camelot.")
|
|
112
138
|
print(f" Erreur originale : {e}")
|
|
@@ -171,11 +197,11 @@ def get_detailed_results():
|
|
|
171
197
|
|
|
172
198
|
# Nettoyer et convertir les colonnes e_coli_npp_100ml et enterocoques_npp_100ml
|
|
173
199
|
if "e_coli_npp_100ml" in cleaned_df.columns:
|
|
174
|
-
cleaned_df["e_coli_npp_100ml"] = cleaned_df["e_coli_npp_100ml"].astype(str).str.replace(r"<\s*10", "10", regex=True)
|
|
200
|
+
cleaned_df["e_coli_npp_100ml"] = cleaned_df["e_coli_npp_100ml"].astype(str).str.replace(" ", "", regex=False).str.replace(r"<\s*10", "10", regex=True)
|
|
175
201
|
cleaned_df["e_coli_npp_100ml"] = pd.to_numeric(cleaned_df["e_coli_npp_100ml"], errors="coerce").astype('Int64')
|
|
176
202
|
|
|
177
203
|
if "enterocoques_npp_100ml" in cleaned_df.columns:
|
|
178
|
-
cleaned_df["enterocoques_npp_100ml"] = cleaned_df["enterocoques_npp_100ml"].astype(str).str.replace(r"<\s*10", "10", regex=True)
|
|
204
|
+
cleaned_df["enterocoques_npp_100ml"] = cleaned_df["enterocoques_npp_100ml"].astype(str).str.replace(" ", "", regex=False).str.replace(r"<\s*10", "10", regex=True)
|
|
179
205
|
cleaned_df["enterocoques_npp_100ml"] = pd.to_numeric(cleaned_df["enterocoques_npp_100ml"], errors="coerce").astype('Int64')
|
|
180
206
|
|
|
181
207
|
# Convertir la colonne 'date' en datetime (format jour/mois/année)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: edb-noumea
|
|
3
|
-
Version: 0.3.6
|
|
3
|
+
Version: 0.3.7
|
|
4
4
|
Summary: Scraper pour la qualité des eaux de baignade à Nouméa.
|
|
5
5
|
Project-URL: Homepage, https://github.com/adriens/edb-noumea
|
|
6
6
|
Project-URL: Repository, https://github.com/adriens/edb-noumea
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
edb_noumea/__init__.py,sha256=G7WKTGLsr2wtW1E2jYpqq4miZLoSGhTifSE36CGNkLo,60
|
|
2
|
+
edb_noumea/details.py,sha256=vdHlaohUBtGtNCi46DrxUxYE34UHjBw4MKz3B_A1o6k,10379
|
|
3
|
+
edb_noumea/main.py,sha256=ekx8XF7b3-w63AuVrZqVPg7IKHRL0cZGYmudsHa6DTk,2137
|
|
4
|
+
edb_noumea-0.3.7.dist-info/licenses/LICENSE,sha256=mNQ0SS064BtPKYHabMRg2yM3m-GDX4MgDQ6ZnDFiueI,1100
|
|
5
|
+
edb_noumea-0.3.7.dist-info/METADATA,sha256=VdFm-Hozr9L5XlGh06DLBqyQlHGzon1nXqgtSjdpEa0,7126
|
|
6
|
+
edb_noumea-0.3.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
7
|
+
edb_noumea-0.3.7.dist-info/top_level.txt,sha256=Dj3JusM0b5H9_f9yZeO-IwucCZzI1OHSjLMKtvRjq6k,11
|
|
8
|
+
edb_noumea-0.3.7.dist-info/RECORD,,
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
edb_noumea/__init__.py,sha256=G7WKTGLsr2wtW1E2jYpqq4miZLoSGhTifSE36CGNkLo,60
|
|
2
|
-
edb_noumea/details.py,sha256=n9FgkNPzEAZf9HQxTlU7OWjyofi57O00zIqQghyarE8,8821
|
|
3
|
-
edb_noumea/main.py,sha256=ekx8XF7b3-w63AuVrZqVPg7IKHRL0cZGYmudsHa6DTk,2137
|
|
4
|
-
edb_noumea-0.3.6.dist-info/licenses/LICENSE,sha256=mNQ0SS064BtPKYHabMRg2yM3m-GDX4MgDQ6ZnDFiueI,1100
|
|
5
|
-
edb_noumea-0.3.6.dist-info/METADATA,sha256=xAqPGKMwfOvz_HF65QvBcWvyrK8iwGTY5QwwcJmMmYU,7126
|
|
6
|
-
edb_noumea-0.3.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
7
|
-
edb_noumea-0.3.6.dist-info/top_level.txt,sha256=Dj3JusM0b5H9_f9yZeO-IwucCZzI1OHSjLMKtvRjq6k,11
|
|
8
|
-
edb_noumea-0.3.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|