istat-census-data 1.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. istat_census_data-1.4.1/LICENSE +21 -0
  2. istat_census_data-1.4.1/PKG-INFO +92 -0
  3. istat_census_data-1.4.1/README.md +55 -0
  4. istat_census_data-1.4.1/istatcelldata/__init__.py +49 -0
  5. istat_census_data-1.4.1/istatcelldata/census1991/__init__.py +0 -0
  6. istat_census_data-1.4.1/istatcelldata/census1991/download.py +121 -0
  7. istat_census_data-1.4.1/istatcelldata/census1991/process.py +136 -0
  8. istat_census_data-1.4.1/istatcelldata/census1991/utils.py +154 -0
  9. istat_census_data-1.4.1/istatcelldata/census2001/__init__.py +0 -0
  10. istat_census_data-1.4.1/istatcelldata/census2001/download.py +60 -0
  11. istat_census_data-1.4.1/istatcelldata/census2011/__init__.py +0 -0
  12. istat_census_data-1.4.1/istatcelldata/census2011/download.py +228 -0
  13. istat_census_data-1.4.1/istatcelldata/census2021/__init__.py +0 -0
  14. istat_census_data-1.4.1/istatcelldata/census2021/download.py +109 -0
  15. istat_census_data-1.4.1/istatcelldata/census2021/utils.py +62 -0
  16. istat_census_data-1.4.1/istatcelldata/config.py +303 -0
  17. istat_census_data-1.4.1/istatcelldata/data.py +154 -0
  18. istat_census_data-1.4.1/istatcelldata/download.py +87 -0
  19. istat_census_data-1.4.1/istatcelldata/executor/__init__.py +0 -0
  20. istat_census_data-1.4.1/istatcelldata/executor/download.py +76 -0
  21. istat_census_data-1.4.1/istatcelldata/executor/preprocess.py +189 -0
  22. istat_census_data-1.4.1/istatcelldata/executor/process.py +126 -0
  23. istat_census_data-1.4.1/istatcelldata/geodata.py +396 -0
  24. istat_census_data-1.4.1/istatcelldata/logger_config.py +106 -0
  25. istat_census_data-1.4.1/istatcelldata/utils.py +322 -0
  26. istat_census_data-1.4.1/pyproject.toml +109 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Massimiliano Moraca
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,92 @@
1
+ Metadata-Version: 2.4
2
+ Name: istat-census-data
3
+ Version: 1.4.1
4
+ Summary: Python package to download and process ISTAT census grid cell data for Italian population censuses (1991-2021)
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Keywords: istat,census,gis,geospatial,italy,demographics,population
8
+ Author: Massimiliano Moraca
9
+ Author-email: gis.massimilianomoraca@gmail.com
10
+ Requires-Python: >=3.10,<4.0
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Natural Language :: English
16
+ Classifier: Natural Language :: Italian
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Programming Language :: Python :: 3.14
24
+ Classifier: Topic :: Scientific/Engineering :: GIS
25
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
26
+ Requires-Dist: chardet (>=5.2.0,<6.0.0)
27
+ Requires-Dist: geopandas (>=1.0.1,<2.0.0)
28
+ Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
29
+ Requires-Dist: requests (>=2.32.0,<3.0.0)
30
+ Requires-Dist: tqdm (>=4.66.5,<5.0.0)
31
+ Requires-Dist: xlrd (>=2.0.1,<3.0.0)
32
+ Project-URL: Documentation, https://maxdragonheart.github.io/istat-census-data/
33
+ Project-URL: Homepage, https://maxdragonheart.github.io/istat-census-data/
34
+ Project-URL: Repository, https://github.com/MaxDragonheart/istat-census-data
35
+ Description-Content-Type: text/markdown
36
+
37
+ # ISTAT Cell Data
38
+
39
+ [![CI](https://github.com/MaxDragonheart/istat-census-data/actions/workflows/ci.yml/badge.svg)](https://github.com/MaxDragonheart/istat-census-data/actions/workflows/ci.yml)
40
+ [![Documentation](https://github.com/MaxDragonheart/istat-census-data/actions/workflows/docs.yml/badge.svg)](https://maxdragonheart.github.io/istat-census-data/)
41
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
42
+ [![PyPI version](https://badge.fury.io/py/istat-census-data.svg)](https://badge.fury.io/py/istat-census-data)
43
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
44
+
45
+ Con **ISTAT Cell Data** puoi ottenere facilmente il set di dati riferiti ai censimenti ISTAT in cui sono stati
46
+ rilasciati anche i dati delle celle censuarie.
47
+
48
+ È possibile scaricare e processare i dati dal 1991 al 2021.
49
+
50
+ ## Installazione
51
+
52
+ ```bash
53
+ pip install istat-census-data
54
+ ```
55
+
56
+ ## Rinomina del progetto e manutenzione
57
+
58
+ `istat-census-data` è la distribuzione PyPI mantenuta a partire dalla versione 1.4.0
59
+ ed è la continuazione del precedente progetto `istatcelldata`.
60
+
61
+ Il nome della distribuzione PyPI è `istat-census-data`, mentre il package Python
62
+ da importare resta `istatcelldata`:
63
+
64
+ ```python
65
+ import istatcelldata
66
+ ```
67
+
68
+ La vecchia distribuzione/repository `istatcelldata` non è più mantenuta e non riceverà
69
+ nuove release. Aggiorna le dipendenze di installazione da `istatcelldata` a
70
+ `istat-census-data`; il codice Python esistente può continuare a usare
71
+ `import istatcelldata`.
72
+
73
+ ## Pubblicazione release
74
+
75
+ Le nuove release PyPI devono essere pubblicate da GitHub Actions tramite PyPI Trusted
76
+ Publishing. Il workflow di rilascio non usa token PyPI salvati in locale o nei GitHub
77
+ secrets.
78
+
79
+ Prima della prima pubblicazione, configura su PyPI un pending Trusted Publisher che
80
+ corrisponda al workflow di rilascio del repository e usa un environment GitHub protetto
81
+ per approvare la pubblicazione.
82
+
83
+ Per pubblicare, crea una GitHub Release sul tag che corrisponde alla versione in
84
+ `pyproject.toml`. Lo script `release.sh` resta solo una verifica locale pre-rilascio:
85
+ non pubblica più su PyPI e non effettua il deploy della documentazione.
86
+
87
+ !!! INFO
88
+
89
+ Questo progetto non è collegato ad ISTAT né supportato da ISTAT ed è una iniziativa autonoma di [Massimiliano Moraca](https://massimilianomoraca.me/).
90
+
91
+ Questo repository è stato creato grazie a [MkDocs](https://www.mkdocs.org/), [Material for MkDocs](https://squidfunk.github.io/mkdocs-material) e [mkdocstrings](https://mkdocstrings.github.io/).
92
+
@@ -0,0 +1,55 @@
1
+ # ISTAT Cell Data
2
+
3
+ [![CI](https://github.com/MaxDragonheart/istat-census-data/actions/workflows/ci.yml/badge.svg)](https://github.com/MaxDragonheart/istat-census-data/actions/workflows/ci.yml)
4
+ [![Documentation](https://github.com/MaxDragonheart/istat-census-data/actions/workflows/docs.yml/badge.svg)](https://maxdragonheart.github.io/istat-census-data/)
5
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
6
+ [![PyPI version](https://badge.fury.io/py/istat-census-data.svg)](https://badge.fury.io/py/istat-census-data)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
+
9
+ Con **ISTAT Cell Data** puoi ottenere facilmente il set di dati riferiti ai censimenti ISTAT in cui sono stati
10
+ rilasciati anche i dati delle celle censuarie.
11
+
12
+ È possibile scaricare e processare i dati dal 1991 al 2021.
13
+
14
+ ## Installazione
15
+
16
+ ```bash
17
+ pip install istat-census-data
18
+ ```
19
+
20
+ ## Rinomina del progetto e manutenzione
21
+
22
+ `istat-census-data` è la distribuzione PyPI mantenuta a partire dalla versione 1.4.0
23
+ ed è la continuazione del precedente progetto `istatcelldata`.
24
+
25
+ Il nome della distribuzione PyPI è `istat-census-data`, mentre il package Python
26
+ da importare resta `istatcelldata`:
27
+
28
+ ```python
29
+ import istatcelldata
30
+ ```
31
+
32
+ La vecchia distribuzione/repository `istatcelldata` non è più mantenuta e non riceverà
33
+ nuove release. Aggiorna le dipendenze di installazione da `istatcelldata` a
34
+ `istat-census-data`; il codice Python esistente può continuare a usare
35
+ `import istatcelldata`.
36
+
37
+ ## Pubblicazione release
38
+
39
+ Le nuove release PyPI devono essere pubblicate da GitHub Actions tramite PyPI Trusted
40
+ Publishing. Il workflow di rilascio non usa token PyPI salvati in locale o nei GitHub
41
+ secrets.
42
+
43
+ Prima della prima pubblicazione, configura su PyPI un pending Trusted Publisher che
44
+ corrisponda al workflow di rilascio del repository e usa un environment GitHub protetto
45
+ per approvare la pubblicazione.
46
+
47
+ Per pubblicare, crea una GitHub Release sul tag che corrisponde alla versione in
48
+ `pyproject.toml`. Lo script `release.sh` resta solo una verifica locale pre-rilascio:
49
+ non pubblica più su PyPI e non effettua il deploy della documentazione.
50
+
51
+ !!! INFO
52
+
53
+ Questo progetto non è collegato ad ISTAT né supportato da ISTAT ed è una iniziativa autonoma di [Massimiliano Moraca](https://massimilianomoraca.me/).
54
+
55
+ Questo repository è stato creato grazie a [MkDocs](https://www.mkdocs.org/), [Material for MkDocs](https://squidfunk.github.io/mkdocs-material) e [mkdocstrings](https://mkdocstrings.github.io/).
@@ -0,0 +1,49 @@
1
"""ISTAT Cell Data - Italian census cell data processing library.

ISTAT Cell Data provides easy access to Italian National Institute of Statistics
(ISTAT) census data, including census grid cell datasets from complete population
censuses.

This library allows you to download and process census data from 1991 to 2021,
including demographic data and geographic information (shapefiles, administrative
boundaries) for census sections across Italy.
"""

# Must match the distribution version published in the package metadata
# (PKG-INFO declares 1.4.1); it was previously left at the stale "1.3.0".
__version__ = "1.4.1"
__author__ = "Massimiliano Moraca"
__email__ = "gis.massimilianomoraca@gmail.com"

# Expose main modules for convenient imports
from istatcelldata import census1991, census2001, census2011, census2021, executor
from istatcelldata.config import DOWNLOAD_RAW_DATA, census_data
from istatcelldata.data import preprocess_data
from istatcelldata.download import download_base
from istatcelldata.executor.process import finalize_census_data
from istatcelldata.geodata import (
    preprocess_geodata,
    read_administrative_boundaries,
    read_census,
)

# Public API re-exported by ``from istatcelldata import *``.
__all__ = [
    # Version info
    "__version__",
    "__author__",
    "__email__",
    # Submodules
    "census1991",
    "census2001",
    "census2011",
    "census2021",
    "executor",
    # Configuration
    "census_data",
    "DOWNLOAD_RAW_DATA",
    # Main functions
    "download_base",
    "preprocess_data",
    "read_administrative_boundaries",
    "read_census",
    "preprocess_geodata",
    "finalize_census_data",
]
@@ -0,0 +1,121 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ from istatcelldata.census1991.utils import census_trace, read_xls
5
+ from istatcelldata.census2011.download import download_administrative_boundaries, download_geodata
6
+ from istatcelldata.census2011.download import download_data as dwn
7
+ from istatcelldata.config import CENSUS_DATA_FOLDER, DATA_FOLDER, PREPROCESSING_FOLDER
8
+ from istatcelldata.utils import get_census_dictionary, remove_files
9
+
10
+ # Define the logger as a global variable
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def download_data(output_data_folder: Path, census_year: int) -> Path:
    """Download, organize, and process census data for a specific year.

    The function runs the whole acquisition workflow:

    1. Look up the link dictionary for ``census_year`` and read its census code.
    2. Download the raw data via ``dwn()``.
    3. Create the ``DATA_FOLDER/CENSUS_DATA_FOLDER`` folder below the download folder.
    4. Recursively collect every ``.xls`` file below the download folder.
    5. Convert each Excel file to CSV with ``read_xls()``.
    6. Extract the field trace with ``census_trace()`` from the first file.
    7. Remove the original Excel files with ``remove_files()``.

    Args:
        output_data_folder: Root folder path where downloaded data will be saved.
        census_year: Reference year for the census data to process.

    Returns:
        The folder returned by ``dwn()``, which contains the downloaded and
        processed census data.

    Raises:
        FileNotFoundError: If no ``.xls`` file is found in the data folder.
            (A subclass of ``Exception``, so callers that catch ``Exception``
            keep working.)

    Note:
        The field trace is extracted only from the first XLS file, where "first"
        is the lowest path in sorted order so the choice is reproducible.
        XLS files are removed at the end of the process to reduce disk usage.
    """
    link_dict = get_census_dictionary(census_year=census_year)
    census_code = link_dict[f"census{census_year}"]["census_code"]

    data_folder = dwn(output_data_folder=output_data_folder, census_year=census_year)

    final_folder = data_folder.joinpath(DATA_FOLDER, CENSUS_DATA_FOLDER)
    final_folder.mkdir(parents=True, exist_ok=True)

    # rglob() yields entries in filesystem order; sort them so the file used
    # for the trace extraction does not vary between runs or platforms.
    files_list = sorted(data_folder.rglob("*.xls"))
    if not files_list:
        logger.error("Nessun file XLS trovato nella cartella dei dati.")
        raise FileNotFoundError("Nessun file XLS trovato per il tracciamento.")

    logger.info("Estrazione dei dati censuari in formato xls e conversione in csv.")
    # Convert every XLS file to CSV inside the final folder
    for file_path in files_list:
        read_xls(file_path=file_path, census_code=census_code, output_path=final_folder)

    # Extract the data trace from the first XLS file found
    first_element = files_list[0]
    logger.info("Extracting data trace from file %s", first_element)
    census_trace(file_path=first_element, year=census_year, output_path=final_folder)

    # Remove the XLS files, which are no longer needed once converted
    logger.info("Removing XLS files from folder %s", data_folder)
    remove_files(files_path=files_list)

    logger.info("Census data download completed and saved in %s", data_folder)
    return data_folder
73
+
74
+
75
def download_all_census_data_1991(
    output_data_folder: Path, region_list: list[int] | None = None
) -> Path:
    """Download the tabular data, geodata, and boundaries of the 1991 Census.

    Everything is stored below ``output_data_folder / PREPROCESSING_FOLDER``,
    which is created when missing. Three downloads run in sequence, all for
    census year 1991:

    - ``download_data()`` for the tabular census data
    - ``download_geodata()`` for the geodata of the requested regions
    - ``download_administrative_boundaries()`` for the administrative boundaries

    Args:
        output_data_folder: Main path where all downloaded and processed data
            will be saved.
        region_list: Regions for which to download geodata. When ``None``, an
            empty list is forwarded to ``download_geodata()``; presumably that
            helper treats an empty list as "all regions" (confirm against its
            implementation).

    Returns:
        ``output_data_folder`` itself, unchanged.
    """
    census_year = 1991

    # Work on a private copy so the caller's list is never shared downstream.
    regions = list(region_list) if region_list is not None else []

    # Make sure the preprocessing folder exists before anything is downloaded.
    preprocessing_dir = output_data_folder / PREPROCESSING_FOLDER
    preprocessing_dir.mkdir(parents=True, exist_ok=True)

    # Tabular census data
    download_data(output_data_folder=preprocessing_dir, census_year=census_year)

    # Geodata for the selected regions
    download_geodata(
        output_data_folder=preprocessing_dir,
        region_list=regions,
        census_year=census_year,
    )

    # Administrative boundaries
    download_administrative_boundaries(
        output_data_folder=preprocessing_dir, census_year=census_year
    )

    return output_data_folder
@@ -0,0 +1,136 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+
6
+ from istatcelldata.geodata import read_administrative_boundaries
7
+
8
+ # Define the logger as a global variable
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def _load_boundaries(file_path: Path, target_columns: list, level: str) -> pd.DataFrame:
    """Read one administrative dataset and return it with its index turned back into a column."""
    logger.info("Reading %s data from %s", level, file_path)
    boundaries = read_administrative_boundaries(
        file_path=file_path,
        target_columns=target_columns,
        index_column=target_columns[0],
    )
    # The reader can hand back a Path instead of a frame; no output path is
    # passed here, so a Path is unexpected and cannot be merged.
    if isinstance(boundaries, Path):
        raise ValueError("Expected DataFrame but got Path from read_administrative_boundaries")
    boundaries = boundaries.reset_index()
    logger.info(
        "%s data read successfully. %d records found.", level.capitalize(), len(boundaries)
    )
    return boundaries


def add_administrative_info(
    census_data: pd.DataFrame,
    regions_data_path: Path,
    regions_target_columns: list,
    provinces_data_path: Path,
    provinces_target_columns: list,
    municipalities_data_path: Path,
    municipalities_target_columns: list,
) -> pd.DataFrame:
    """Enrich census data with administrative information (municipalities, provinces, regions).

    The workflow is:

    1. Upper-case the census column names (on a new frame; the caller's
       DataFrame is left untouched).
    2. Read the regional, provincial, and municipal datasets.
    3. Left-merge municipalities with provinces on ``COD_PROV``.
    4. Left-merge the result with regions on ``COD_REG``.
    5. Left-merge the census data with that table on ``PRO_COM``.
    6. Drop ``COD_PRO``/``PRO_COM`` when present and rename the remaining
       administrative columns (``COD_COM`` -> ``CODCOM``, ``COD_PROV`` ->
       ``CODPRO``, ``COD_REG`` -> ``CODREG``, ``DEN_PROV`` -> ``PROVINCIA``,
       ``DEN_REG`` -> ``REGIONE``).

    Args:
        census_data: Census dataset to which administrative information will be added.
        regions_data_path: Path to the file containing regional data.
        regions_target_columns: Columns to extract from the regional dataset
            (the first column is used as the index).
        provinces_data_path: Path to the file containing provincial data.
        provinces_target_columns: Columns to extract from the provincial dataset
            (the first column is used as the index).
        municipalities_data_path: Path to the file containing municipal data.
        municipalities_target_columns: Columns to extract from the municipal
            dataset (the first column is used as the index).

    Returns:
        A new DataFrame holding the census rows enriched with municipality,
        province, and region information.

    Raises:
        ValueError: If ``read_administrative_boundaries()`` returns a ``Path``
            instead of a DataFrame for any of the three datasets.
    """
    logger.info("Starting to add administrative information to census data.")

    # Upper-case the labels on a new object: assigning to ``.columns`` would
    # silently rename the columns of the DataFrame owned by the caller.
    census_data = census_data.set_axis(census_data.columns.str.upper(), axis=1)
    logger.info("Census dataset column names converted to uppercase.")

    regions_data = _load_boundaries(regions_data_path, regions_target_columns, "regional")
    provinces_data = _load_boundaries(provinces_data_path, provinces_target_columns, "provincial")
    municipalities_data = _load_boundaries(
        municipalities_data_path, municipalities_target_columns, "municipal"
    )

    # Merge municipal data with provincial data
    logger.info("Starting merge between municipal and provincial data.")
    add_provinces = pd.merge(
        left=municipalities_data, right=provinces_data, how="left", on="COD_PROV"
    )
    logger.info(
        "Merge between municipalities and provinces completed. %d resulting records.",
        len(add_provinces),
    )

    # Merge resulting data with regional data
    logger.info("Starting merge between municipal-provincial and regional data.")
    add_regions = pd.merge(left=add_provinces, right=regions_data, how="left", on="COD_REG")
    logger.info(
        "Merge between municipalities, provinces and regions completed. %d resulting records.",
        len(add_regions),
    )

    # Final merge of census data with the combined administrative table
    logger.info("Starting final merge with census data.")
    enriched = pd.merge(left=census_data, right=add_regions, how="left", on="PRO_COM")
    logger.info("Final merge completed. %d records in final dataset.", len(enriched))

    # Only drop the helper keys that actually exist in the merged frame.
    columns_to_drop = [column for column in ("COD_PRO", "PRO_COM") if column in enriched]
    enriched = enriched.drop(columns=columns_to_drop).rename(
        columns={
            "COD_COM": "CODCOM",
            "COD_PROV": "CODPRO",
            "COD_REG": "CODREG",
            "DEN_PROV": "PROVINCIA",
            "DEN_REG": "REGIONE",
        }
    )

    logger.info("Addition of administrative information completed successfully.")

    return enriched
@@ -0,0 +1,154 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ import xlrd
6
+ from tqdm import tqdm
7
+
8
+
9
def read_xls(
    file_path: Path,
    census_code: str,
    output_path: Path | None = None,
) -> pd.DataFrame | Path:
    """Read an Excel file (.xls) and return a DataFrame or save the data as CSV.

    The first sheet that is not named "Metadati" is read in full. Its first row
    provides the column names (lower-cased), every value is cast to ``int``, and
    the ``census_code`` column becomes the sorted index.

    If an output folder is given, the DataFrame is written there as a
    ``;``-separated CSV; otherwise it is returned directly.

    Args:
        file_path: Path to the Excel file to read.
        census_code: Name of the column to use as the DataFrame index
            (must match a lower-cased column name of the sheet).
        output_path: Folder where the resulting CSV will be saved.
            If None, the DataFrame is returned without saving.

    Returns:
        The DataFrame read from the Excel file if ``output_path`` is None,
        otherwise the path of the CSV file that was written.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        xlrd.XLRDError: If an error occurs while reading the Excel file.
        Exception: For any unexpected error during parsing or saving.
    """
    try:
        logging.info(f"Lettura del file Excel da {file_path}")

        workbook = xlrd.open_workbook(file_path)

        # Use the first sheet that is not the "Metadati" description sheet.
        data_sheets = [name for name in workbook.sheet_names() if name != "Metadati"]
        sheet = workbook.sheet_by_name(data_sheets[0])

        # Pull every row out of the sheet; the first one is the header.
        rows = [
            sheet.row_values(row_id)
            for row_id in tqdm(range(sheet.nrows), desc="Lettura righe...")
        ]
        header, records = rows[0], rows[1:]
        df = pd.DataFrame(data=records, columns=[name.lower() for name in header])

        # Cast everything to integers, then index and sort by the census code.
        df = df.astype(int)
        df = df.set_index(census_code).sort_index()

        if output_path is None:
            return df

        # The stem may embed a Windows-style "folder\name" prefix (presumably
        # left over from archive member names extracted on POSIX - confirm).
        # Keep only the last component: a stem without any backslash is used
        # as-is instead of failing with IndexError.
        file_name = file_path.stem.rsplit("\\", 1)[-1]
        csv_path = output_path.joinpath(f"{file_name}.csv")
        logging.info(f"Salvataggio dei dati in {csv_path}")
        df.to_csv(path_or_buf=csv_path, sep=";")
        return csv_path

    except FileNotFoundError:
        logging.error(f"Excel file not found: {file_path}")
        raise
    except xlrd.XLRDError as e:
        logging.error(f"Error reading Excel file: {str(e)}")
        raise
    except Exception as e:
        logging.error(f"Error reading Excel file or saving data: {str(e)}")
        raise
88
+
89
+
90
def census_trace(
    file_path: Path, year: int, output_path: Path | None = None
) -> pd.DataFrame | Path:
    """Build the field trace table from the "Metadati" sheet of an Excel file.

    Only the first two cells of each row of the "Metadati" sheet are kept and
    the first seven rows are skipped. The next row supplies the column names,
    the remaining rows are the data, and the ``"NOME CAMPO"`` column becomes
    the index. When an output folder is given, the table is written there as
    ``tracciato_{year}_sezioni.csv`` using ``;`` as separator.

    Args:
        file_path: Path to the Excel file from which to extract metadata.
        year: Reference year for the census, used to build the output file name.
        output_path: Folder where the trace CSV will be saved.
            If None, the DataFrame is returned directly.

    Returns:
        The path of the generated CSV file if ``output_path`` is provided,
        otherwise the DataFrame holding the trace table.

    Raises:
        FileNotFoundError: If the specified Excel file does not exist.
        xlrd.XLRDError: If an error occurs while opening or reading the Excel file.
        Exception: For any unexpected errors during parsing or saving.
    """
    try:
        logging.info(f"Lettura dei dati da {file_path}")
        workbook = xlrd.open_workbook(file_path)
        metadata_sheet = workbook.sheet_by_name("Metadati")

        # Keep the first two cells of every row, then skip the first 7 rows.
        trace_rows = [
            metadata_sheet.row_values(row_id)[:2] for row_id in range(metadata_sheet.nrows)
        ][7:]

        # First remaining row is the header, the rest is the data.
        header = list(trace_rows[0])
        df = pd.DataFrame(data=trace_rows[1:], columns=header)
        df.set_index("NOME CAMPO", inplace=True)

        logging.info("Dati letti con successo.")

        if output_path is None:
            return df

        # NOTE: ``file_path`` is rebound to the CSV destination from here on,
        # so the FileNotFoundError handler below reports this path if saving fails.
        file_path = output_path.joinpath(f"tracciato_{year}_sezioni.csv")
        logging.info(f"Salvataggio dei dati in {file_path}")
        df.to_csv(path_or_buf=file_path, sep=";")
        return file_path

    except FileNotFoundError as err:
        logging.error(f"Excel file not found: {file_path}")
        raise err
    except xlrd.XLRDError as err:
        logging.error(f"Error reading Excel file: {str(err)}")
        raise err
    except Exception as err:
        logging.error(f"Error reading Excel file or saving data: {str(err)}")
        raise err