bopa-fetcher 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bopa_fetcher-0.1.0/LICENSE +21 -0
- bopa_fetcher-0.1.0/MANIFEST.in +3 -0
- bopa_fetcher-0.1.0/PKG-INFO +120 -0
- bopa_fetcher-0.1.0/README.md +81 -0
- bopa_fetcher-0.1.0/pyproject.toml +75 -0
- bopa_fetcher-0.1.0/setup.cfg +4 -0
- bopa_fetcher-0.1.0/src/bopa/__init__.py +15 -0
- bopa_fetcher-0.1.0/src/bopa/api/__init__.py +3 -0
- bopa_fetcher-0.1.0/src/bopa/api/client.py +157 -0
- bopa_fetcher-0.1.0/src/bopa/constants.py +15 -0
- bopa_fetcher-0.1.0/src/bopa/models/__init__.py +4 -0
- bopa_fetcher-0.1.0/src/bopa/models/article.py +42 -0
- bopa_fetcher-0.1.0/src/bopa/models/bulletin.py +67 -0
- bopa_fetcher-0.1.0/src/bopa/service/__init__.py +11 -0
- bopa_fetcher-0.1.0/src/bopa/service/article.py +122 -0
- bopa_fetcher-0.1.0/src/bopa/service/bulletin.py +190 -0
- bopa_fetcher-0.1.0/src/bopa/service/links.py +48 -0
- bopa_fetcher-0.1.0/src/bopa_fetcher.egg-info/PKG-INFO +120 -0
- bopa_fetcher-0.1.0/src/bopa_fetcher.egg-info/SOURCES.txt +25 -0
- bopa_fetcher-0.1.0/src/bopa_fetcher.egg-info/dependency_links.txt +1 -0
- bopa_fetcher-0.1.0/src/bopa_fetcher.egg-info/requires.txt +18 -0
- bopa_fetcher-0.1.0/src/bopa_fetcher.egg-info/top_level.txt +1 -0
- bopa_fetcher-0.1.0/tests/test_article_service.py +137 -0
- bopa_fetcher-0.1.0/tests/test_bulletin_service.py +312 -0
- bopa_fetcher-0.1.0/tests/test_client.py +335 -0
- bopa_fetcher-0.1.0/tests/test_links.py +36 -0
- bopa_fetcher-0.1.0/tests/test_models.py +121 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Diego González Suárez and contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bopa-fetcher
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python library for fetching structured information from the BOPA (Boletín Oficial del Principado de Asturias) web portal.
|
|
5
|
+
Author-email: Diego González Suárez <gonzalezsdiego@uniovi.es>, Noelia Rico <noeliarico@uniovi.es>
|
|
6
|
+
Maintainer-email: Diego González Suárez <gonzalezsdiego@uniovi.es>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://diegoglezsu.github.io/bopa-fetcher
|
|
9
|
+
Project-URL: Documentation, https://diegoglezsu.github.io/bopa-fetcher
|
|
10
|
+
Project-URL: Repository, https://github.com/diegoglezsu/bopa-fetcher.git
|
|
11
|
+
Project-URL: Issues, https://github.com/diegoglezsu/bopa-fetcher/issues
|
|
12
|
+
Keywords: official gazette,asturian legislation,data analysis,policy research,legal analytics
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Intended Audience :: Legal Industry
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
18
|
+
Classifier: Topic :: Text Processing :: General
|
|
19
|
+
Requires-Python: >=3.7
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: bs4==0.0.2
|
|
23
|
+
Requires-Dist: matplotlib>=3.9.0
|
|
24
|
+
Requires-Dist: numpy>=2.1.0
|
|
25
|
+
Requires-Dist: pandas>=2.2.2
|
|
26
|
+
Requires-Dist: requests>=2.33.0
|
|
27
|
+
Requires-Dist: seaborn>=0.13.2
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
|
31
|
+
Requires-Dist: black>=21.0; extra == "dev"
|
|
32
|
+
Requires-Dist: flake8>=3.9; extra == "dev"
|
|
33
|
+
Requires-Dist: mypy>=0.900; extra == "dev"
|
|
34
|
+
Provides-Extra: docs
|
|
35
|
+
Requires-Dist: mkdocs<2.0,>=1.6; extra == "docs"
|
|
36
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "docs"
|
|
37
|
+
Requires-Dist: mkdocstrings[python]>=0.25; extra == "docs"
|
|
38
|
+
Dynamic: license-file
|
|
39
|
+
|
|
40
|
+
# bopa-fetcher
|
|
41
|
+
|
|
42
|
+
[](https://github.com/diegoglezsu/bopa-fetcher/actions/workflows/tests.yml)
|
|
43
|
+
[](https://github.com/diegoglezsu/bopa-fetcher/actions/workflows/github-code-scanning/codeql)
|
|
44
|
+
[](https://sonarcloud.io/summary/new_code?id=diegoglezsu_bopa-fetcher)
|
|
45
|
+
[](https://app.codecov.io/github/diegoglezsu/bopa-fetcher)
|
|
46
|
+
[](https://pypi.org/project/bopa-fetcher/)
|
|
47
|
+
[](https://diegoglezsu.github.io/bopa-fetcher/)
|
|
48
|
+
[]()
|
|
49
|
+
|
|
50
|
+
## Description
|
|
51
|
+
|
|
52
|
+

|
|
53
|
+
|
|
54
|
+
**bopa-fetcher** is a Python library for programmatic access to the official bulletins of the Principality of Asturias (BOPA). It allows users to search, retrieve, and analyze bulletin summaries and individual articles in a structured manner.
|
|
55
|
+
|
|
56
|
+
## Why bopa-fetcher?
|
|
57
|
+
|
|
58
|
+
[BOPA (Boletín Oficial del Principado de Asturias)](https://miprincipado.asturias.es/bopa) is the official gazette of the region of Asturias, Spain. Researchers, legal professionals, and journalists often need to search, download, and analyze large volumes of legislative and administrative documents. **bopa-fetcher** provides a simple, programmatic interface to:
|
|
59
|
+
|
|
60
|
+
- Retrieve bulletin summaries and articles as structured Python objects.
|
|
61
|
+
- Search across date ranges for both bulletins and individual articles.
|
|
62
|
+
- Export data to dictionaries for integration with data analysis pipelines (pandas, NumPy, etc.).
|
|
63
|
+
- Avoid manual scraping by handling HTML parsing and URL construction internally.
|
|
64
|
+
|
|
65
|
+
> [!WARNING]
|
|
66
|
+
> BOPA bulletins are available in the portal from **01/01/2000** onwards. Requests for earlier dates will return no data.
|
|
67
|
+
|
|
68
|
+
## Main features
|
|
69
|
+
|
|
70
|
+
- **Legal research**: Download and analyze official bulletins for a specific time period to track legislative changes.
|
|
71
|
+
- **Data journalism**: Collect structured data from BOPA for investigative reporting on regional governance.
|
|
72
|
+
- **Policy analysis**: Extract and categorize dispositions by origin (council, council board, presidency, etc.) for quantitative studies.
|
|
73
|
+
- **Archive building**: Build reproducible datasets of Asturian official publications for academic research.
|
|
74
|
+
|
|
75
|
+
## Quick Start
|
|
76
|
+
|
|
77
|
+
### Installation
|
|
78
|
+
|
|
79
|
+
Install from PyPI:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install bopa-fetcher
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Basic Usage Example
|
|
86
|
+
|
|
87
|
+
Fetch acts for a publication date:
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from bopa.api import Client
|
|
91
|
+
client = Client()
|
|
92
|
+
|
|
93
|
+
# Get the bulletin summary for 29/12/2023
|
|
94
|
+
bulletin = client.get_bulletin(date="29/12/2023")
|
|
95
|
+
print(bulletin.to_dict())
|
|
96
|
+
|
|
97
|
+
# get specific article by code
|
|
98
|
+
article = client.get_article(cod="2023-11737", date="29/12/2023")
|
|
99
|
+
print(f" article: {article.to_dict()}")
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Use Case Examples
|
|
104
|
+
|
|
105
|
+
The repository includes runnable scripts with examples and use cases of the library. These scripts can be found in the `scripts/` directory.
|
|
106
|
+
|
|
107
|
+
## License
|
|
108
|
+
|
|
109
|
+
This project is licensed under the MIT License. See the `LICENSE` file for details.
|
|
110
|
+
|
|
111
|
+
## Contact
|
|
112
|
+
|
|
113
|
+
For any questions or suggestions, feel free to reach out to the corresponding author:
|
|
114
|
+
|
|
115
|
+
- **Author**: Diego González Suárez, Noelia Rico Pachón
|
|
116
|
+
- **Email**: <gonzalezsdiego@uniovi.es>, <noeliarico@uniovi.es>
|
|
117
|
+
|
|
118
|
+
## Acknowledgements
|
|
119
|
+
|
|
120
|
+
## Citation
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# bopa-fetcher
|
|
2
|
+
|
|
3
|
+
[](https://github.com/diegoglezsu/bopa-fetcher/actions/workflows/tests.yml)
|
|
4
|
+
[](https://github.com/diegoglezsu/bopa-fetcher/actions/workflows/github-code-scanning/codeql)
|
|
5
|
+
[](https://sonarcloud.io/summary/new_code?id=diegoglezsu_bopa-fetcher)
|
|
6
|
+
[](https://app.codecov.io/github/diegoglezsu/bopa-fetcher)
|
|
7
|
+
[](https://pypi.org/project/bopa-fetcher/)
|
|
8
|
+
[](https://diegoglezsu.github.io/bopa-fetcher/)
|
|
9
|
+
[]()
|
|
10
|
+
|
|
11
|
+
## Description
|
|
12
|
+
|
|
13
|
+

|
|
14
|
+
|
|
15
|
+
**bopa-fetcher** is a Python library for programmatic access to the official bulletins of the Principality of Asturias (BOPA). It allows users to search, retrieve, and analyze bulletin summaries and individual articles in a structured manner.
|
|
16
|
+
|
|
17
|
+
## Why bopa-fetcher?
|
|
18
|
+
|
|
19
|
+
[BOPA (Boletín Oficial del Principado de Asturias)](https://miprincipado.asturias.es/bopa) is the official gazette of the region of Asturias, Spain. Researchers, legal professionals, and journalists often need to search, download, and analyze large volumes of legislative and administrative documents. **bopa-fetcher** provides a simple, programmatic interface to:
|
|
20
|
+
|
|
21
|
+
- Retrieve bulletin summaries and articles as structured Python objects.
|
|
22
|
+
- Search across date ranges for both bulletins and individual articles.
|
|
23
|
+
- Export data to dictionaries for integration with data analysis pipelines (pandas, NumPy, etc.).
|
|
24
|
+
- Avoid manual scraping by handling HTML parsing and URL construction internally.
|
|
25
|
+
|
|
26
|
+
> [!WARNING]
|
|
27
|
+
> BOPA bulletins are available in the portal from **01/01/2000** onwards. Requests for earlier dates will return no data.
|
|
28
|
+
|
|
29
|
+
## Main features
|
|
30
|
+
|
|
31
|
+
- **Legal research**: Download and analyze official bulletins for a specific time period to track legislative changes.
|
|
32
|
+
- **Data journalism**: Collect structured data from BOPA for investigative reporting on regional governance.
|
|
33
|
+
- **Policy analysis**: Extract and categorize dispositions by origin (council, council board, presidency, etc.) for quantitative studies.
|
|
34
|
+
- **Archive building**: Build reproducible datasets of Asturian official publications for academic research.
|
|
35
|
+
|
|
36
|
+
## Quick Start
|
|
37
|
+
|
|
38
|
+
### Installation
|
|
39
|
+
|
|
40
|
+
Install from PyPI:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install bopa-fetcher
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Basic Usage Example
|
|
47
|
+
|
|
48
|
+
Fetch acts for a publication date:
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from bopa.api import Client
|
|
52
|
+
client = Client()
|
|
53
|
+
|
|
54
|
+
# Get the bulletin summary for 29/12/2023
|
|
55
|
+
bulletin = client.get_bulletin(date="29/12/2023")
|
|
56
|
+
print(bulletin.to_dict())
|
|
57
|
+
|
|
58
|
+
# get specific article by code
|
|
59
|
+
article = client.get_article(cod="2023-11737", date="29/12/2023")
|
|
60
|
+
print(f" article: {article.to_dict()}")
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Use Case Examples
|
|
65
|
+
|
|
66
|
+
The repository includes runnable scripts with examples and use cases of the library. These scripts can be found in the `scripts/` directory.
|
|
67
|
+
|
|
68
|
+
## License
|
|
69
|
+
|
|
70
|
+
This project is licensed under the MIT License. See the `LICENSE` file for details.
|
|
71
|
+
|
|
72
|
+
## Contact
|
|
73
|
+
|
|
74
|
+
For any questions or suggestions, feel free to reach out to the authors:
|
|
75
|
+
|
|
76
|
+
- **Authors**: Diego González Suárez, Noelia Rico Pachón
|
|
77
|
+
- **Email**: <gonzalezsdiego@uniovi.es>, <noeliarico@uniovi.es>
|
|
78
|
+
|
|
79
|
+
## Acknowledgements
|
|
80
|
+
|
|
81
|
+
## Citation
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "bopa-fetcher"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A Python library for fetching structured information from the BOPA (Boletín Oficial del Principado de Asturias) web portal."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license-files = ["LICENSE"]
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Diego González Suárez", email = "gonzalezsdiego@uniovi.es"},
|
|
14
|
+
{name = "Noelia Rico", email = "noeliarico@uniovi.es"}
|
|
15
|
+
]
|
|
16
|
+
maintainers = [
|
|
17
|
+
{name = "Diego González Suárez", email = "gonzalezsdiego@uniovi.es"},
|
|
18
|
+
]
|
|
19
|
+
keywords = ["official gazette", "asturian legislation", "data analysis", "policy research", "legal analytics"]
|
|
20
|
+
classifiers = [
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Operating System :: OS Independent",
|
|
23
|
+
"Intended Audience :: Legal Industry",
|
|
24
|
+
"Intended Audience :: Science/Research",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
26
|
+
"Topic :: Text Processing :: General",
|
|
27
|
+
]
|
|
28
|
+
license = "MIT"
|
|
29
|
+
dependencies = [
|
|
30
|
+
"bs4==0.0.2",
|
|
31
|
+
"matplotlib>=3.9.0",
|
|
32
|
+
"numpy>=2.1.0",
|
|
33
|
+
"pandas>=2.2.2",
|
|
34
|
+
"requests>=2.33.0",
|
|
35
|
+
"seaborn>=0.13.2"
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
[project.optional-dependencies]
|
|
40
|
+
dev = [
|
|
41
|
+
"pytest>=6.0",
|
|
42
|
+
"pytest-cov>=2.0",
|
|
43
|
+
"black>=21.0",
|
|
44
|
+
"flake8>=3.9",
|
|
45
|
+
"mypy>=0.900",
|
|
46
|
+
]
|
|
47
|
+
docs = [
|
|
48
|
+
"mkdocs>=1.6,<2.0",
|
|
49
|
+
"mkdocs-material>=9.5",
|
|
50
|
+
"mkdocstrings[python]>=0.25",
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
[project.urls]
|
|
54
|
+
Homepage = "https://diegoglezsu.github.io/bopa-fetcher"
|
|
55
|
+
Documentation = "https://diegoglezsu.github.io/bopa-fetcher"
|
|
56
|
+
Repository = "https://github.com/diegoglezsu/bopa-fetcher.git"
|
|
57
|
+
Issues = "https://github.com/diegoglezsu/bopa-fetcher/issues"
|
|
58
|
+
|
|
59
|
+
[tool.setuptools.packages.find]
|
|
60
|
+
where = ["src"]
|
|
61
|
+
|
|
62
|
+
[tool.black]
|
|
63
|
+
line-length = 88
|
|
64
|
+
target-version = ["py37"]
|
|
65
|
+
|
|
66
|
+
[tool.isort]
|
|
67
|
+
profile = "black"
|
|
68
|
+
line_length = 88
|
|
69
|
+
|
|
70
|
+
[tool.mypy]
|
|
71
|
+
python_version = "3.9"
|
|
72
|
+
warn_return_any = true
|
|
73
|
+
warn_unused_configs = true
|
|
74
|
+
disallow_untyped_defs = true
|
|
75
|
+
ignore_missing_imports = true
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""bopa-fetcher: A Python library for fetching structured information from BOPA.
|
|
2
|
+
|
|
3
|
+
BOPA (Boletín Oficial del Principado de Asturias) is the official gazette
|
|
4
|
+
of the region of Asturias, Spain. This library provides programmatic access
|
|
5
|
+
to its bulletins and articles via web scraping.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__author__ = ["Diego González Suárez", "Noelia Rico"]
|
|
9
|
+
__email__ = ["gonzalezsdiego@uniovi.es", "noeliarico@uniovi.es"]
|
|
10
|
+
__version__ = "0.1.0"
|
|
11
|
+
|
|
12
|
+
from . import service
|
|
13
|
+
from . import api
|
|
14
|
+
from . import models
|
|
15
|
+
from . import constants
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from datetime import datetime, timedelta
|
|
2
|
+
|
|
3
|
+
from bopa.constants import DATE_MIN
|
|
4
|
+
|
|
5
|
+
from ..models import BulletinArticle, BulletinSummary
|
|
6
|
+
from ..service.article import Article
|
|
7
|
+
from ..service.bulletin import Bulletin
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Client:
    """Entry point for fetching BOPA bulletins and articles.

    The class holds no state: each method validates its arguments and then
    delegates to the ``Bulletin`` and ``Article`` services. Every date is a
    string in dd/mm/YYYY format and must not be earlier than ``DATE_MIN``.
    """

    # strptime/strftime pattern for the dd/mm/YYYY dates used by every method.
    _DATE_FORMAT = "%d/%m/%Y"

    @staticmethod
    def _log_skipped(what: str, day: str) -> None:
        """Log, with traceback, that a range query skipped ``day``.

        Must be called from inside an ``except`` block so that the active
        exception is attached to the log record.
        """
        # Local import: keeps the module's import block untouched.
        import logging

        logging.getLogger(__name__).debug(
            "Skipping %s for %s", what, day, exc_info=True
        )

    def _days_in_range(self, date_from: str, date_to: str) -> list[str]:
        """Validate a date range and list each of its days as dd/mm/YYYY.

        Args:
            date_from: Start date in dd/mm/YYYY format.
            date_to: End date in dd/mm/YYYY format (inclusive).

        Returns:
            One string per calendar day from ``date_from`` to ``date_to``.

        Raises:
            ValueError: If a date is malformed, ``date_from`` is later than
                ``date_to``, or ``date_from`` is earlier than ``DATE_MIN``.
        """
        start = datetime.strptime(date_from, self._DATE_FORMAT)
        end = datetime.strptime(date_to, self._DATE_FORMAT)

        if start > end:
            raise ValueError("date_from must be earlier than or equal to date_to.")

        if start < DATE_MIN:
            raise ValueError(f"date_from must be on or after {DATE_MIN}.")

        span = (end - start).days
        return [
            (start + timedelta(days=offset)).strftime(self._DATE_FORMAT)
            for offset in range(span + 1)
        ]

    def get_bulletin(
        self,
        date: str,
        text_contains: str | None = None,
        origin_contains: str | None = None,
    ) -> BulletinSummary:
        """Fetch the bulletin summary for a specific date.

        Args:
            date: Date in dd/mm/YYYY format (e.g. "29/12/2023").
            text_contains: Optional string to filter entries by.
            origin_contains: Optional string to filter entries by origin.

        Returns:
            BulletinSummary corresponding to the given date.

        Raises:
            ValueError: If ``date`` is empty, malformed, or earlier than
                ``DATE_MIN``.
        """
        if not date:
            raise ValueError("The 'date' parameter is required.")

        if datetime.strptime(date, self._DATE_FORMAT) < DATE_MIN:
            raise ValueError(f"date must be on or after {DATE_MIN}.")

        return Bulletin(date=date).get_bulletin(
            text_contains=text_contains, origin_contains=origin_contains
        )

    def get_bulletins(
        self,
        date_from: str,
        date_to: str,
        text_contains: str | None = None,
        origin_contains: str | None = None,
    ) -> list[BulletinSummary]:
        """Fetch all bulletin summaries in a date range.

        Every calendar day in the range is queried. Days whose bulletin
        cannot be fetched are skipped and logged at debug level.

        Args:
            date_from: Start date in dd/mm/YYYY format.
            date_to: End date in dd/mm/YYYY format (inclusive).
            text_contains: Optional string to filter entries by.
            origin_contains: Optional string to filter entries by origin.

        Returns:
            List of BulletinSummary objects, one per day that could be fetched.

        Raises:
            ValueError: If a date is malformed, ``date_from`` is later than
                ``date_to``, or ``date_from`` is earlier than ``DATE_MIN``.
        """
        summaries: list[BulletinSummary] = []
        for day in self._days_in_range(date_from, date_to):
            try:
                summaries.append(
                    self.get_bulletin(
                        day,
                        text_contains=text_contains,
                        origin_contains=origin_contains,
                    )
                )
            except Exception:
                # Best-effort by design: one failing day must not abort the
                # whole range. The error is logged rather than discarded.
                self._log_skipped("bulletin", day)

        return summaries

    def get_article(
        self,
        cod: str,
        date: str,
    ) -> BulletinArticle:
        """Fetch a specific article by code and date.

        Args:
            cod: Article disposition code (e.g. "2023-11737").
            date: Date in dd/mm/YYYY format.

        Returns:
            BulletinArticle with full content and metadata.

        Raises:
            ValueError: If ``cod`` or ``date`` is empty, or ``date`` is
                malformed or earlier than ``DATE_MIN``.
        """
        if not cod or not date:
            raise ValueError("Both 'cod' and 'date' must be provided.")

        if datetime.strptime(date, self._DATE_FORMAT) < DATE_MIN:
            raise ValueError(f"date must be on or after {DATE_MIN}.")

        return Article(cod=cod, date=date).get_article()

    def get_articles(
        self,
        date_from: str,
        date_to: str,
        text_contains: str | None = None,
        origin_contains: str | None = None,
    ) -> list[BulletinArticle]:
        """Fetch all articles in a date range.

        Iterates over each calendar day in the range, retrieves the bulletin
        summary, and fetches the full content of every article listed. When
        a day fails, the articles already fetched are kept, the rest of that
        day is skipped, and the error is logged at debug level.

        Args:
            date_from: Start date in dd/mm/YYYY format.
            date_to: End date in dd/mm/YYYY format (inclusive).
            text_contains: Optional string to filter bulletin entries by.
            origin_contains: Optional string to filter bulletin entries
                by origin.

        Returns:
            List of BulletinArticle objects in the range.

        Raises:
            ValueError: If a date is malformed, ``date_from`` is later than
                ``date_to``, or ``date_from`` is earlier than ``DATE_MIN``.
        """
        articles: list[BulletinArticle] = []
        for day in self._days_in_range(date_from, date_to):
            try:
                bulletin = self.get_bulletin(
                    day,
                    text_contains=text_contains,
                    origin_contains=origin_contains,
                )
                articles.extend(
                    Article(cod=cod, num=bulletin.num, date=bulletin.date).get_article()
                    for cod in bulletin.codes
                )
            except Exception:
                # Best-effort by design: one failing day must not abort the
                # whole range. The error is logged rather than discarded.
                self._log_skipped("articles", day)

        return articles
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Constants for BOPA web portal URLs and HTML element identifiers."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Base URL of the BOPA web portal.
BOPA_URL = "https://miprincipado.asturias.es/bopa"

# URL of the dispositions section, built on the portal base URL.
DISPOSITIONS_URL = BOPA_URL + "/disposiciones"
# Misspelled original name, kept as an alias so existing imports keep working.
DISPOSITONS_URL = DISPOSITIONS_URL

SUMMARY_URL = "https://miprincipado.asturias.es/bopa-sumario"

# HTML element identifiers.
BOPA_ARTICLE_ID = "bopa-articulo"
BOPA_BULLETIN_ID = "bopa-boletin"

DATE_MIN = datetime(2000, 1, 1)  # Minimum date for BOPA bulletins
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
import datetime
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class BulletinArticle:
    """Full content for one BOPA article.

    Attributes:
        code: Disposition code (e.g. "2023-11737").
        num: Bulletin number.
        date: Publication date.
        origin: Origin hierarchy string (part / chapter / topic / sub-author).
        content: List of text paragraphs comprising the article body.
        link_html: URL to the HTML detail page.
        link_pdf: URL to the PDF document.
    """

    code: str
    num: str
    # This file does ``import datetime``, so a bare ``datetime`` annotation
    # would name the module; the class has to be spelled out in full.
    date: datetime.datetime
    origin: str
    content: list[str]
    link_html: str
    link_pdf: str

    def to_dict(self) -> dict[str, object]:
        """Serialize the article to a dictionary.

        Returns:
            Dict with all fields; date is formatted as dd/mm/YYYY.
        """
        return {
            "code": self.code,
            "num": self.num,
            "date": self.date.strftime("%d/%m/%Y"),
            "origin": self.origin,
            "content": self.content,
            "link_html": self.link_html,
            "link_pdf": self.link_pdf,
        }
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class BulletinSummaryEntry:
    """A single disposition listed in the summary of a BOPA bulletin.

    Attributes:
        code: Disposition code (e.g. "2023-11737"), or "N/A" if not found.
        origin: Origin hierarchy string (part / chapter / topic / sub-author).
        description: Short text description of the disposition.
        link_html: URL to the HTML detail page.
        link_pdf: URL to the PDF document.
    """

    code: str
    origin: str
    description: str
    link_html: str
    link_pdf: str

    def to_dict(self) -> dict[str, str]:
        """Return the entry as a plain dictionary.

        Returns:
            Dict mapping each field name to its value, in declaration order.
        """
        names = ("code", "origin", "description", "link_html", "link_pdf")
        return {name: getattr(self, name) for name in names}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class BulletinSummary:
    """Structured summary for one BOPA bulletin.

    Attributes:
        num: Bulletin number.
        date: Publication date.
        summary: List of disposition entries; empty by default.
    """

    num: str
    date: datetime
    summary: list[BulletinSummaryEntry] = field(default_factory=list)

    @property
    def codes(self) -> list[str]:
        """Disposition codes of the entries, skipping the "N/A" placeholder."""
        return [entry.code for entry in self.summary if entry.code != "N/A"]

    def to_dict(self) -> dict[str, object]:
        """Serialize the bulletin summary to a dictionary.

        Returns:
            Dict with num, date (dd/mm/YYYY), and serialized entries.
        """
        return {
            "num": self.num,
            "date": self.date.strftime("%d/%m/%Y"),
            "summary": [entry.to_dict() for entry in self.summary],
        }
|