istat-census-data 1.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- istat_census_data-1.4.1/LICENSE +21 -0
- istat_census_data-1.4.1/PKG-INFO +92 -0
- istat_census_data-1.4.1/README.md +55 -0
- istat_census_data-1.4.1/istatcelldata/__init__.py +49 -0
- istat_census_data-1.4.1/istatcelldata/census1991/__init__.py +0 -0
- istat_census_data-1.4.1/istatcelldata/census1991/download.py +121 -0
- istat_census_data-1.4.1/istatcelldata/census1991/process.py +136 -0
- istat_census_data-1.4.1/istatcelldata/census1991/utils.py +154 -0
- istat_census_data-1.4.1/istatcelldata/census2001/__init__.py +0 -0
- istat_census_data-1.4.1/istatcelldata/census2001/download.py +60 -0
- istat_census_data-1.4.1/istatcelldata/census2011/__init__.py +0 -0
- istat_census_data-1.4.1/istatcelldata/census2011/download.py +228 -0
- istat_census_data-1.4.1/istatcelldata/census2021/__init__.py +0 -0
- istat_census_data-1.4.1/istatcelldata/census2021/download.py +109 -0
- istat_census_data-1.4.1/istatcelldata/census2021/utils.py +62 -0
- istat_census_data-1.4.1/istatcelldata/config.py +303 -0
- istat_census_data-1.4.1/istatcelldata/data.py +154 -0
- istat_census_data-1.4.1/istatcelldata/download.py +87 -0
- istat_census_data-1.4.1/istatcelldata/executor/__init__.py +0 -0
- istat_census_data-1.4.1/istatcelldata/executor/download.py +76 -0
- istat_census_data-1.4.1/istatcelldata/executor/preprocess.py +189 -0
- istat_census_data-1.4.1/istatcelldata/executor/process.py +126 -0
- istat_census_data-1.4.1/istatcelldata/geodata.py +396 -0
- istat_census_data-1.4.1/istatcelldata/logger_config.py +106 -0
- istat_census_data-1.4.1/istatcelldata/utils.py +322 -0
- istat_census_data-1.4.1/pyproject.toml +109 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023 Massimiliano Moraca
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: istat-census-data
|
|
3
|
+
Version: 1.4.1
|
|
4
|
+
Summary: Python package to download and process ISTAT census grid cell data for Italian population censuses (1991-2021)
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: istat,census,gis,geospatial,italy,demographics,population
|
|
8
|
+
Author: Massimiliano Moraca
|
|
9
|
+
Author-email: gis.massimilianomoraca@gmail.com
|
|
10
|
+
Requires-Python: >=3.10,<4.0
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Natural Language :: English
|
|
16
|
+
Classifier: Natural Language :: Italian
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: GIS
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
|
+
Requires-Dist: chardet (>=5.2.0,<6.0.0)
|
|
27
|
+
Requires-Dist: geopandas (>=1.0.1,<2.0.0)
|
|
28
|
+
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
|
29
|
+
Requires-Dist: requests (>=2.32.0,<3.0.0)
|
|
30
|
+
Requires-Dist: tqdm (>=4.66.5,<5.0.0)
|
|
31
|
+
Requires-Dist: xlrd (>=2.0.1,<3.0.0)
|
|
32
|
+
Project-URL: Documentation, https://maxdragonheart.github.io/istat-census-data/
|
|
33
|
+
Project-URL: Homepage, https://maxdragonheart.github.io/istat-census-data/
|
|
34
|
+
Project-URL: Repository, https://github.com/MaxDragonheart/istat-census-data
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# ISTAT Cell Data
|
|
38
|
+
|
|
39
|
+
[](https://github.com/MaxDragonheart/istat-census-data/actions/workflows/ci.yml)
|
|
40
|
+
[](https://maxdragonheart.github.io/istat-census-data/)
|
|
41
|
+
[](https://www.python.org/downloads/)
|
|
42
|
+
[](https://badge.fury.io/py/istat-census-data)
|
|
43
|
+
[](https://opensource.org/licenses/MIT)
|
|
44
|
+
|
|
45
|
+
Con **ISTAT Cell Data** puoi ottenere facilmente il set di dati riferiti ai censimenti ISTAT in cui sono stati
|
|
46
|
+
rilasciati anche i dati delle celle censuarie.
|
|
47
|
+
|
|
48
|
+
È possibile scaricare e processare i dati dal 1991 al 2021.
|
|
49
|
+
|
|
50
|
+
## Installazione
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install istat-census-data
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Rinomina del progetto e manutenzione
|
|
57
|
+
|
|
58
|
+
`istat-census-data` è la distribuzione PyPI mantenuta a partire dalla versione 1.4.0
|
|
59
|
+
ed è la continuazione del precedente progetto `istatcelldata`.
|
|
60
|
+
|
|
61
|
+
Il nome della distribuzione PyPI è `istat-census-data`, mentre il package Python
|
|
62
|
+
da importare resta `istatcelldata`:
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
import istatcelldata
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
La vecchia distribuzione/repository `istatcelldata` non è più mantenuta e non riceverà
|
|
69
|
+
nuove release. Aggiorna le dipendenze di installazione da `istatcelldata` a
|
|
70
|
+
`istat-census-data`; il codice Python esistente può continuare a usare
|
|
71
|
+
`import istatcelldata`.
|
|
72
|
+
|
|
73
|
+
## Pubblicazione release
|
|
74
|
+
|
|
75
|
+
Le nuove release PyPI devono essere pubblicate da GitHub Actions tramite PyPI Trusted
|
|
76
|
+
Publishing. Il workflow di rilascio non usa token PyPI salvati in locale o nei GitHub
|
|
77
|
+
secrets.
|
|
78
|
+
|
|
79
|
+
Prima della prima pubblicazione, configura su PyPI un pending Trusted Publisher che
|
|
80
|
+
corrisponda al workflow di rilascio del repository e usa un environment GitHub protetto
|
|
81
|
+
per approvare la pubblicazione.
|
|
82
|
+
|
|
83
|
+
Per pubblicare, crea una GitHub Release sul tag che corrisponde alla versione in
|
|
84
|
+
`pyproject.toml`. Lo script `release.sh` resta solo una verifica locale pre-rilascio:
|
|
85
|
+
non pubblica più su PyPI e non effettua il deploy della documentazione.
|
|
86
|
+
|
|
87
|
+
!!! INFO
|
|
88
|
+
|
|
89
|
+
Questo progetto non è collegato ad ISTAT né supportato da ISTAT ed è una iniziativa autonoma di [Massimiliano Moraca](https://massimilianomoraca.me/).
|
|
90
|
+
|
|
91
|
+
Questo repository è stato creato grazie a [MkDocs](https://www.mkdocs.org/), [Material for MkDocs](https://squidfunk.github.io/mkdocs-material) e [mkdocstrings](https://mkdocstrings.github.io/).
|
|
92
|
+
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# ISTAT Cell Data
|
|
2
|
+
|
|
3
|
+
[](https://github.com/MaxDragonheart/istat-census-data/actions/workflows/ci.yml)
|
|
4
|
+
[](https://maxdragonheart.github.io/istat-census-data/)
|
|
5
|
+
[](https://www.python.org/downloads/)
|
|
6
|
+
[](https://badge.fury.io/py/istat-census-data)
|
|
7
|
+
[](https://opensource.org/licenses/MIT)
|
|
8
|
+
|
|
9
|
+
Con **ISTAT Cell Data** puoi ottenere facilmente il set di dati riferiti ai censimenti ISTAT in cui sono stati
|
|
10
|
+
rilasciati anche i dati delle celle censuarie.
|
|
11
|
+
|
|
12
|
+
È possibile scaricare e processare i dati dal 1991 al 2021.
|
|
13
|
+
|
|
14
|
+
## Installazione
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install istat-census-data
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Rinomina del progetto e manutenzione
|
|
21
|
+
|
|
22
|
+
`istat-census-data` è la distribuzione PyPI mantenuta a partire dalla versione 1.4.0
|
|
23
|
+
ed è la continuazione del precedente progetto `istatcelldata`.
|
|
24
|
+
|
|
25
|
+
Il nome della distribuzione PyPI è `istat-census-data`, mentre il package Python
|
|
26
|
+
da importare resta `istatcelldata`:
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import istatcelldata
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
La vecchia distribuzione/repository `istatcelldata` non è più mantenuta e non riceverà
|
|
33
|
+
nuove release. Aggiorna le dipendenze di installazione da `istatcelldata` a
|
|
34
|
+
`istat-census-data`; il codice Python esistente può continuare a usare
|
|
35
|
+
`import istatcelldata`.
|
|
36
|
+
|
|
37
|
+
## Pubblicazione release
|
|
38
|
+
|
|
39
|
+
Le nuove release PyPI devono essere pubblicate da GitHub Actions tramite PyPI Trusted
|
|
40
|
+
Publishing. Il workflow di rilascio non usa token PyPI salvati in locale o nei GitHub
|
|
41
|
+
secrets.
|
|
42
|
+
|
|
43
|
+
Prima della prima pubblicazione, configura su PyPI un pending Trusted Publisher che
|
|
44
|
+
corrisponda al workflow di rilascio del repository e usa un environment GitHub protetto
|
|
45
|
+
per approvare la pubblicazione.
|
|
46
|
+
|
|
47
|
+
Per pubblicare, crea una GitHub Release sul tag che corrisponde alla versione in
|
|
48
|
+
`pyproject.toml`. Lo script `release.sh` resta solo una verifica locale pre-rilascio:
|
|
49
|
+
non pubblica più su PyPI e non effettua il deploy della documentazione.
|
|
50
|
+
|
|
51
|
+
!!! INFO
|
|
52
|
+
|
|
53
|
+
Questo progetto non è collegato ad ISTAT né supportato da ISTAT ed è una iniziativa autonoma di [Massimiliano Moraca](https://massimilianomoraca.me/).
|
|
54
|
+
|
|
55
|
+
Questo repository è stato creato grazie a [MkDocs](https://www.mkdocs.org/), [Material for MkDocs](https://squidfunk.github.io/mkdocs-material) e [mkdocstrings](https://mkdocstrings.github.io/).
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""ISTAT Cell Data - Italian census cell data processing library.
|
|
2
|
+
|
|
3
|
+
ISTAT Cell Data provides easy access to Italian National Institute of Statistics
|
|
4
|
+
(ISTAT) census data, including census grid cell datasets from complete population
|
|
5
|
+
censuses.
|
|
6
|
+
|
|
7
|
+
This library allows you to download and process census data from 1991 to 2021,
|
|
8
|
+
including demographic data and geographic information (shapefiles, administrative
|
|
9
|
+
boundaries) for census sections across Italy.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
# Runtime version string; must match the distribution version declared in the
# package metadata (PKG-INFO / pyproject.toml) for this release.
__version__ = "1.4.1"
|
|
13
|
+
__author__ = "Massimiliano Moraca"
|
|
14
|
+
__email__ = "gis.massimilianomoraca@gmail.com"
|
|
15
|
+
|
|
16
|
+
# Expose main modules for convenient imports
|
|
17
|
+
from istatcelldata import census1991, census2001, census2011, census2021, executor
|
|
18
|
+
from istatcelldata.config import DOWNLOAD_RAW_DATA, census_data
|
|
19
|
+
from istatcelldata.data import preprocess_data
|
|
20
|
+
from istatcelldata.download import download_base
|
|
21
|
+
from istatcelldata.executor.process import finalize_census_data
|
|
22
|
+
from istatcelldata.geodata import (
|
|
23
|
+
preprocess_geodata,
|
|
24
|
+
read_administrative_boundaries,
|
|
25
|
+
read_census,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
# Version info
|
|
30
|
+
"__version__",
|
|
31
|
+
"__author__",
|
|
32
|
+
"__email__",
|
|
33
|
+
# Submodules
|
|
34
|
+
"census1991",
|
|
35
|
+
"census2001",
|
|
36
|
+
"census2011",
|
|
37
|
+
"census2021",
|
|
38
|
+
"executor",
|
|
39
|
+
# Configuration
|
|
40
|
+
"census_data",
|
|
41
|
+
"DOWNLOAD_RAW_DATA",
|
|
42
|
+
# Main functions
|
|
43
|
+
"download_base",
|
|
44
|
+
"preprocess_data",
|
|
45
|
+
"read_administrative_boundaries",
|
|
46
|
+
"read_census",
|
|
47
|
+
"preprocess_geodata",
|
|
48
|
+
"finalize_census_data",
|
|
49
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from istatcelldata.census1991.utils import census_trace, read_xls
|
|
5
|
+
from istatcelldata.census2011.download import download_administrative_boundaries, download_geodata
|
|
6
|
+
from istatcelldata.census2011.download import download_data as dwn
|
|
7
|
+
from istatcelldata.config import CENSUS_DATA_FOLDER, DATA_FOLDER, PREPROCESSING_FOLDER
|
|
8
|
+
from istatcelldata.utils import get_census_dictionary, remove_files
|
|
9
|
+
|
|
10
|
+
# Define the logger as a global variable
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def download_data(output_data_folder: Path, census_year: int) -> Path:
    """Download the census tables for ``census_year`` and convert them to CSV.

    Steps performed:

    1. Look up the census code for the year via ``get_census_dictionary()``.
    2. Download the raw data with ``dwn()`` (the ``download_data`` function of
       the 2011 census module).
    3. Create the ``DATA_FOLDER/CENSUS_DATA_FOLDER`` sub-folder inside the
       folder returned by ``dwn()``.
    4. Convert every ``.xls`` file found (recursively) under that folder to
       CSV with ``read_xls()``.
    5. Extract the metadata trace from the first ``.xls`` file with
       ``census_trace()``.
    6. Delete the original ``.xls`` files with ``remove_files()``.

    Args:
        output_data_folder: Root folder where downloaded data is saved.
        census_year: Reference year of the census to process.

    Returns:
        The folder returned by ``dwn()``, which holds the downloaded data.

    Raises:
        FileNotFoundError: If no ``.xls`` file is found under the data folder.
            This is a subclass of ``Exception``, so callers that caught the
            previously raised bare ``Exception`` keep working.
    """
    link_dict = get_census_dictionary(census_year=census_year)
    census_code = link_dict[f"census{census_year}"]["census_code"]

    data_folder = dwn(output_data_folder=output_data_folder, census_year=census_year)

    final_folder = data_folder.joinpath(DATA_FOLDER, CENSUS_DATA_FOLDER)
    final_folder.mkdir(parents=True, exist_ok=True)

    # rglob() yields files in arbitrary filesystem order; sorting makes the
    # choice of the file used for the metadata trace reproducible.
    files_list = sorted(data_folder.rglob("*.xls"))
    if not files_list:
        logger.error("Nessun file XLS trovato nella cartella dei dati.")
        raise FileNotFoundError("Nessun file XLS trovato per il tracciamento.")

    logger.info("Estrazione dei dati censuari in formato xls e conversione in csv.")
    # Convert each XLS file to CSV inside the final folder
    for file_path in files_list:
        read_xls(file_path=file_path, census_code=census_code, output_path=final_folder)

    # The metadata trace is extracted from a single file only
    first_element = files_list[0]
    logger.info("Extracting data trace from file %s", first_element)
    census_trace(file_path=first_element, year=census_year, output_path=final_folder)

    # The XLS sources are no longer needed once the CSV files exist
    logger.info("Removing XLS files from folder %s", data_folder)
    remove_files(files_path=files_list)

    logger.info("Census data download completed and saved in %s", data_folder)
    return data_folder
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def download_all_census_data_1991(
    output_data_folder: Path, region_list: list[int] | None = None
) -> Path:
    """Fetch the tabular data, geodata and boundaries of the 1991 Census.

    Everything is downloaded into the ``PREPROCESSING_FOLDER`` sub-folder of
    ``output_data_folder``, which is created if it does not exist. Three
    downloads are run in sequence:

    - tabular census data (``download_data()``)
    - geodata for the requested regions (``download_geodata()``)
    - administrative boundaries (``download_administrative_boundaries()``)

    Args:
        output_data_folder: Root folder for all downloaded and processed data.
        region_list: Regions whose geodata must be downloaded. ``None`` is
            forwarded to ``download_geodata()`` as an empty list, which is
            documented upstream as "all available regions".

    Returns:
        ``output_data_folder``, unchanged.
    """
    census_year = 1991
    regions = list(region_list) if region_list is not None else []

    # Working folder shared by the three download steps
    preprocessing_dir = output_data_folder.joinpath(PREPROCESSING_FOLDER)
    preprocessing_dir.mkdir(parents=True, exist_ok=True)

    # Tabular data
    download_data(output_data_folder=preprocessing_dir, census_year=census_year)

    # Geodata for the selected regions
    download_geodata(
        output_data_folder=preprocessing_dir,
        region_list=regions,
        census_year=census_year,
    )

    # Administrative boundaries
    download_administrative_boundaries(
        output_data_folder=preprocessing_dir, census_year=census_year
    )

    return output_data_folder
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from istatcelldata.geodata import read_administrative_boundaries
|
|
7
|
+
|
|
8
|
+
# Define the logger as a global variable
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def add_administrative_info(
    census_data: pd.DataFrame,
    regions_data_path: Path,
    regions_target_columns: list,
    provinces_data_path: Path,
    provinces_target_columns: list,
    municipalities_data_path: Path,
    municipalities_target_columns: list,
) -> pd.DataFrame:
    """Enrich census data with municipality, province and region information.

    Workflow:

    1. Upper-case the census column names. This is done in place, so the
       DataFrame passed by the caller is modified as well.
    2. Load the regional, provincial and municipal boundary tables with
       ``read_administrative_boundaries()``.
    3. Left-merge municipalities with provinces on ``COD_PROV``.
    4. Left-merge the result with regions on ``COD_REG``.
    5. Left-merge the census data with the result on ``PRO_COM``.
    6. Drop ``COD_PRO`` and ``PRO_COM`` (when present) and rename the
       remaining administrative columns (``COD_COM`` -> ``CODCOM``,
       ``COD_PROV`` -> ``CODPRO``, ``COD_REG`` -> ``CODREG``,
       ``DEN_PROV`` -> ``PROVINCIA``, ``DEN_REG`` -> ``REGIONE``).

    Args:
        census_data: Census dataset to enrich.
        regions_data_path: Path to the file containing regional data.
        regions_target_columns: Columns to read from the regional dataset;
            the first one is used as index column while loading.
        provinces_data_path: Path to the file containing provincial data.
        provinces_target_columns: Columns to read from the provincial dataset;
            the first one is used as index column while loading.
        municipalities_data_path: Path to the file containing municipal data.
        municipalities_target_columns: Columns to read from the municipal
            dataset; the first one is used as index column while loading.

    Returns:
        A new DataFrame holding the census rows plus the administrative
        columns.

    Raises:
        ValueError: If ``read_administrative_boundaries()`` returns a ``Path``
            instead of a DataFrame for any of the three datasets.
    """
    logger.info("Starting to add administrative information to census data.")

    # NOTE: assigning to .columns mutates the caller's DataFrame in place.
    census_data.columns = census_data.columns.str.upper()
    logger.info("Census dataset column names converted to uppercase.")

    regions_data = _load_boundaries("regional", regions_data_path, regions_target_columns)
    provinces_data = _load_boundaries("provincial", provinces_data_path, provinces_target_columns)
    municipalities_data = _load_boundaries(
        "municipal", municipalities_data_path, municipalities_target_columns
    )

    # Municipalities + provinces
    logger.info("Starting merge between municipal and provincial data.")
    add_provinces = pd.merge(
        left=municipalities_data, right=provinces_data, how="left", on="COD_PROV"
    )
    logger.info(
        "Merge between municipalities and provinces completed. %d resulting records.",
        len(add_provinces),
    )

    # (Municipalities + provinces) + regions
    logger.info("Starting merge between municipal-provincial and regional data.")
    add_regions = pd.merge(left=add_provinces, right=regions_data, how="left", on="COD_REG")
    logger.info(
        "Merge between municipalities, provinces and regions completed. %d resulting records.",
        len(add_regions),
    )

    # Census data + administrative hierarchy
    logger.info("Starting final merge with census data.")
    enriched = pd.merge(left=census_data, right=add_regions, how="left", on="PRO_COM")
    logger.info("Final merge completed. %d records in final dataset.", len(enriched))

    # Remove the join keys that are not part of the final layout, when present
    columns_to_drop = [column for column in ("COD_PRO", "PRO_COM") if column in enriched]
    enriched = enriched.drop(columns=columns_to_drop).rename(
        columns={
            "COD_COM": "CODCOM",
            "COD_PROV": "CODPRO",
            "COD_REG": "CODREG",
            "DEN_PROV": "PROVINCIA",
            "DEN_REG": "REGIONE",
        }
    )

    logger.info("Addition of administrative information completed successfully.")

    return enriched


def _load_boundaries(label: str, file_path: Path, target_columns: list) -> pd.DataFrame:
    """Load one administrative boundary table and return it with a flat index.

    Args:
        label: Lower-case adjective used in log messages
            (``"regional"``, ``"provincial"`` or ``"municipal"``).
        file_path: Path to the boundary file.
        target_columns: Columns to read; the first one is the index column.

    Returns:
        The boundary table with the index moved back into a regular column.

    Raises:
        ValueError: If ``read_administrative_boundaries()`` returns a ``Path``.
    """
    logger.info("Reading %s data from %s", label, file_path)
    boundaries = read_administrative_boundaries(
        file_path=file_path,
        target_columns=target_columns,
        index_column=target_columns[0],
    )
    if isinstance(boundaries, Path):
        raise ValueError("Expected DataFrame but got Path from read_administrative_boundaries")
    # The index column is needed as a regular column for the later merges
    boundaries = boundaries.reset_index()
    logger.info(
        "%s data read successfully. %d records found.", label.capitalize(), len(boundaries)
    )
    return boundaries
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import xlrd
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def read_xls(
    file_path: Path,
    census_code: str,
    output_path: Path | None = None,
) -> pd.DataFrame | Path:
    """Read an ``.xls`` workbook into a DataFrame, optionally saving it as CSV.

    The first sheet whose name is not ``"Metadati"`` is read. Its first row
    becomes the (lower-cased) column names, all values are cast to ``int``,
    and the ``census_code`` column is set as a sorted index.

    Args:
        file_path: Path to the Excel file to read.
        census_code: Name of the column to use as index. Column names are
            lower-cased before the lookup, so this must be the lower-case name.
        output_path: Folder where the CSV is written. If ``None`` the
            DataFrame is returned and nothing is written.

    Returns:
        The DataFrame when ``output_path`` is ``None``; otherwise the path of
        the ``;``-separated CSV file written into ``output_path``.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        xlrd.XLRDError: If the workbook cannot be read.
        Exception: Any other error raised while parsing or saving is logged
            and re-raised unchanged.
    """
    try:
        logging.info(f"Lettura del file Excel da {file_path}")

        workbook = xlrd.open_workbook(file_path)

        # Pick the first data sheet, skipping the 'Metadati' (metadata) sheet
        data_sheets = [name for name in workbook.sheet_names() if name != "Metadati"]
        sheet = workbook.sheet_by_name(data_sheets[0])

        # Read every row of the sheet, showing a progress bar
        rows = [
            sheet.row_values(row_id)
            for row_id in tqdm(range(sheet.nrows), desc="Lettura righe...")
        ]

        # First row is the header, the remaining rows are the records
        columns = [column_name.lower() for column_name in rows[0]]
        df = pd.DataFrame(data=rows[1:], columns=columns)

        # Cast to int, then index and sort by the census code
        df = df.astype(int).set_index(census_code).sort_index()

        if output_path is None:
            return df

        # The original code took element [1] after splitting the stem on a
        # backslash, which raises IndexError when the stem contains none.
        # Presumably the stem looks like "folder\name" when a Windows-style
        # archive entry is extracted on POSIX -- TODO confirm. Taking the last
        # component gives the same result in that case and also works for a
        # plain file name.
        file_name = file_path.stem.split("\\")[-1]
        csv_path = output_path.joinpath(f"{file_name}.csv")
        logging.info(f"Salvataggio dei dati in {csv_path}")
        df.to_csv(path_or_buf=csv_path, sep=";")
        return csv_path

    except FileNotFoundError:
        logging.error(f"Excel file not found: {file_path}")
        raise
    except xlrd.XLRDError as e:
        logging.error(f"Error reading Excel file: {str(e)}")
        raise
    except Exception as e:
        logging.error(f"Error reading Excel file or saving data: {str(e)}")
        raise
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def census_trace(
    file_path: Path, year: int, output_path: Path | None = None
) -> pd.DataFrame | Path:
    """Extract the field trace from the ``"Metadati"`` sheet of a workbook.

    Only the first two cells of each row are kept and the first 7 rows of
    the sheet are skipped. The next row provides the column names and the
    remaining rows the records; the ``"NOME CAMPO"`` column becomes the index.

    Args:
        file_path: Path to the Excel file from which to extract the metadata.
        year: Census year, used only to build the output file name
            (``tracciato_<year>_sezioni.csv``).
        output_path: Folder where the CSV is written. If ``None`` the
            DataFrame is returned and nothing is written.

    Returns:
        The DataFrame when ``output_path`` is ``None``; otherwise the path of
        the ``;``-separated CSV file written into ``output_path``.

    Raises:
        FileNotFoundError: If the specified Excel file does not exist.
        xlrd.XLRDError: If the workbook cannot be opened or read.
        Exception: Any other error raised while parsing or saving is logged
            and re-raised unchanged.
    """
    try:
        logging.info(f"Lettura dei dati da {file_path}")
        workbook = xlrd.open_workbook(file_path)

        sheet = workbook.sheet_by_name("Metadati")

        # Keep the first two cells of each row, skipping the first 7 rows
        rows = [sheet.row_values(row_id)[:2] for row_id in range(7, sheet.nrows)]

        # First remaining row is the header, the rest are the records
        df = pd.DataFrame(data=rows[1:], columns=list(rows[0]))
        df.set_index("NOME CAMPO", inplace=True)

        logging.info("Dati letti con successo.")

        if output_path is None:
            return df

        # Use a dedicated name for the CSV path: rebinding ``file_path`` would
        # make the FileNotFoundError handler below report the CSV path as the
        # missing Excel file.
        csv_path = output_path.joinpath(f"tracciato_{year}_sezioni.csv")
        logging.info(f"Salvataggio dei dati in {csv_path}")
        df.to_csv(path_or_buf=csv_path, sep=";")
        return csv_path

    except FileNotFoundError:
        logging.error(f"Excel file not found: {file_path}")
        raise
    except xlrd.XLRDError as e:
        logging.error(f"Error reading Excel file: {str(e)}")
        raise
    except Exception as e:
        logging.error(f"Error reading Excel file or saving data: {str(e)}")
        raise
|
|
File without changes
|