wo2net_delpher_tools 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Stichting WO2Net
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.3
2
+ Name: wo2net_delpher_tools
3
+ Version: 0.1.2
4
+ Summary: Tools for extracting and processing Delpher data for WO2Net
5
+ Author: Daan Raven
6
+ Author-email: Daan Raven <daan.raven@wo2net.nl>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2026 Stichting WO2Net
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+ Classifier: Programming Language :: Python :: 3
29
+ Classifier: Programming Language :: Python :: 3.9
30
+ Classifier: Programming Language :: Python :: 3.10
31
+ Classifier: Programming Language :: Python :: 3.11
32
+ Classifier: Programming Language :: Python :: 3.12
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Operating System :: OS Independent
35
+ Classifier: Intended Audience :: Developers
36
+ Classifier: Intended Audience :: Science/Research
37
+ Classifier: Topic :: Text Processing :: Linguistic
38
+ Requires-Dist: lxml>=6.0.2
39
+ Requires-Dist: requests>=2.32.5
40
+ Requires-Dist: sickle>=0.7.0
41
+ Requires-Python: >=3.9.6
42
+ Project-URL: Homepage, https://github.com/Stichting-WO2Net/wo2net-delpher-tools
43
+ Project-URL: Repository, https://github.com/Stichting-WO2Net/wo2net-delpher-tools.git
44
+ Project-URL: Issues, https://github.com/Stichting-WO2Net/wo2net-delpher-tools/issues
45
+ Description-Content-Type: text/markdown
46
+
47
+ # WO2Net Delpher Tools
48
+
49
+ Tools for extracting and processing Delpher data for WO2Net.
50
+
51
+ ## Installation
52
+
53
+ ```bash
54
+ pip install wo2net-delpher-tools
55
+ ```
@@ -0,0 +1,8 @@
1
+ # WO2Net Delpher Tools
2
+
3
+ Tools for extracting and processing Delpher data for WO2Net.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install wo2net-delpher-tools
9
+ ```
@@ -0,0 +1,39 @@
1
[build-system]
# The "uv_build" backend module is shipped by the "uv_build" distribution,
# so that is the package a PEP 517 front end has to install before building.
requires = ["uv_build>=0.9.24,<0.10.0"]
build-backend = "uv_build"
4
+
5
+ [tool.uv.extra-build-dependencies]
6
+ wo2net_delpher_tools = ["uv_build"]
7
+
8
+ [project]
9
+ name = "wo2net_delpher_tools"
10
+ version = "0.1.2"
11
+ description = "Tools for extracting and processing Delpher data for WO2Net"
12
+ readme = "README.md"
13
+ requires-python = ">=3.9.6"
14
+ authors = [
15
+ { name = "Daan Raven", email = "daan.raven@wo2net.nl" }
16
+ ]
17
+ license = {file = "LICENSE"}
18
+ classifiers = [
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.9",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "License :: OSI Approved :: MIT License",
25
+ "Operating System :: OS Independent",
26
+ "Intended Audience :: Developers",
27
+ "Intended Audience :: Science/Research",
28
+ "Topic :: Text Processing :: Linguistic",
29
+ ]
30
+ dependencies = [
31
+ "lxml>=6.0.2",
32
+ "requests>=2.32.5",
33
+ "sickle>=0.7.0",
34
+ ]
35
+
36
+ [project.urls]
37
+ Homepage = "https://github.com/Stichting-WO2Net/wo2net-delpher-tools"
38
+ Repository = "https://github.com/Stichting-WO2Net/wo2net-delpher-tools.git"
39
+ Issues = "https://github.com/Stichting-WO2Net/wo2net-delpher-tools/issues"
@@ -0,0 +1,6 @@
1
+ from .models.issue import Issue
2
+ from .models.page import Page
3
+ from .models.article import Article
4
+ from .pipeline import process_ppn_to_json
5
+ from .utils import save_issues_to_json
6
+ from .ppn import load_ppn_numbers_from_txt
@@ -0,0 +1,3 @@
1
+ from .issue import Issue
2
+ from .page import Page
3
+ from .article import Article
@@ -0,0 +1,28 @@
1
+ from typing import Optional
2
+
3
class Article:
    """Value holder for one article of a Delpher issue.

    Every constructor argument is stored unchanged on the instance under the
    same name; ``to_dict`` exposes the same values as a plain dictionary.
    """

    def __init__(
        self,
        identifier: str,
        title: str,
        subject: Optional[str] = None,
        url: Optional[str] = None,
        ocr: Optional[str] = None,
        ocr_text: Optional[str] = None
    ):
        """Store the given values; all but ``identifier`` and ``title`` default to ``None``."""
        self.identifier, self.title = identifier, title
        self.subject, self.url = subject, url
        self.ocr, self.ocr_text = ocr, ocr_text

    def to_dict(self):
        """Return the article's attributes as a dictionary keyed by attribute name."""
        field_names = ('identifier', 'title', 'subject', 'url', 'ocr', 'ocr_text')
        return {name: getattr(self, name) for name in field_names}
@@ -0,0 +1,156 @@
1
+ from typing import Optional
2
+ from .article import Article
3
+ from .page import Page
4
+ from lxml import etree
5
+
6
class Issue:
    """One newspaper issue, with its descriptive metadata, articles and pages."""

    def __init__(
        self,
        identifier: str,
        url: str,
        ppn: str,
        title: Optional[str] = None,
        date: Optional[str] = None,
        source: Optional[str] = None,
        rights: Optional[str] = None,
        publisher: Optional[str] = None,
        volume: Optional[str] = None,
        issue_number: Optional[str] = None,
        issued: Optional[str] = None,
        spatial: Optional[str] = None,
        articles: Optional[list[Article]] = None,
        pages: Optional[list[Page]] = None
    ):
        """Store the given values; ``articles`` and ``pages`` fall back to new empty lists."""
        self.identifier = identifier
        self.url = url
        self.ppn = ppn
        self.title = title
        self.source = source
        self.date = date
        self.rights = rights
        self.publisher = publisher
        self.volume = volume
        self.issue_number = issue_number
        self.issued = issued
        self.spatial = spatial
        self.articles = articles or []
        self.pages = pages or []

    @classmethod
    def from_oai_record(cls, record):
        """Build an ``Issue`` from an OAI record carrying DIDL XML.

        Args:
            record: Object whose ``raw`` attribute holds the record XML; it
                is parsed with ``lxml.etree.fromstring``.

        Returns:
            An ``Issue`` whose ``articles`` and ``pages`` lists are filled
            from the child items of the issue item. Each page references the
            ``Article`` objects whose identifier matches one of the
            ``ddd:article_id`` attributes found below that page.

        Raises:
            IndexError: If the header identifier, the issue item, or the
                metadata component of the issue, an article or a page is
                not present in the XML.
        """
        root = etree.fromstring(record.raw)
        ns = {
            'didl': 'urn:mpeg:mpeg21:2002:02-DIDL-NS',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcx': 'http://krait.kb.nl/coop/tel/handbook/telterms.html',
            'oai': 'http://www.openarchives.org/OAI/2.0/',
            'srw_dc': 'info:srw/schema/1/dc-v1.1',
            'dcterms': 'http://purl.org/dc/terms/',
            'dcmitype': 'http://purl.org/dc/dcmitype/',
            'ddd': 'http://www.kb.nl/namespaces/ddd',
            'didmodel': 'urn:mpeg:mpeg21:2002:02-DIDMODEL-NS',
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
        }
        # Attribute names in lxml's "{namespace}local" notation.
        dc_identifier_attr = '{http://purl.org/dc/elements/1.1/}identifier'
        article_id_attr = '{http://www.kb.nl/namespaces/ddd}article_id'

        def get_text(element, xpath, namespaces):
            # Text of the first matching element, or None when nothing matches.
            result = element.xpath(xpath, namespaces=namespaces)
            return result[0].text if result else None

        def get_attribute(element, xpath, namespaces):
            # First result of an attribute XPath, or None when nothing matches.
            result = element.xpath(xpath, namespaces=namespaces)
            return result[0] if result else None

        def get_metadata(element, identifier):
            # The dcx metadata element of the item's "<identifier>:metadata"
            # component; raises IndexError when it is missing.
            return element.xpath(
                f'./didl:Component[@dc:identifier="{identifier}:metadata"]/didl:Resource/srw_dc:dcx',
                namespaces=ns,
            )[0]

        # The issue identifier is whatever follows "DDD:" in the OAI header.
        issue_identifier = root.xpath('//oai:header/oai:identifier', namespaces=ns)[0].text.split('DDD:')[-1]
        # The resolver URL uses the identifier up to (not including) ":mpeg21".
        issue_url = 'https://resolver.kb.nl/resolve?urn=' + issue_identifier.split(':mpeg21')[0]
        item = root.xpath(f'//didl:Item[@dc:identifier="{issue_identifier}"]', namespaces=ns)[0]
        metadata = get_metadata(item, issue_identifier)

        issue = cls(
            identifier=issue_identifier,
            url=issue_url,
            ppn=get_text(metadata, './dc:identifier[@xsi:type="dcx:PPN"]', ns),
            title=get_text(metadata, './dc:title', ns),
            date=get_text(metadata, './dc:date', ns),
            source=get_text(metadata, './dc:source', ns),
            rights=get_text(metadata, './dc:rights', ns),
            publisher=get_text(metadata, './dc:publisher', ns),
            volume=get_text(metadata, './dcx:volume', ns),
            issue_number=get_text(metadata, './dcx:issuenumber', ns),
            issued=get_text(metadata, './dcterms:issued', ns),
            spatial=get_text(metadata, './dcterms:spatial', ns),
        )

        # Articles: direct child items whose identifier contains "<issue>:a".
        article_indicator = f"{issue_identifier}:a"
        article_items = item.xpath(f'./didl:Item[contains(@dc:identifier, "{article_indicator}")]', namespaces=ns)

        # Identifier -> articles (in document order), so pages can look up
        # their articles directly instead of rescanning the whole article
        # list for every reference.
        articles_by_identifier = {}
        for article_item in article_items:
            article_identifier = article_item.get(dc_identifier_attr)
            article_metadata = get_metadata(article_item, article_identifier)
            article_object = Article(
                identifier=article_identifier,
                title=get_text(article_metadata, './dc:title', ns),
                subject=get_text(article_metadata, './dc:subject', ns),
                url=get_text(article_metadata, './dc:identifier', ns),
                ocr=get_attribute(
                    article_item,
                    f'./didl:Component[@dc:identifier="{article_identifier}:ocr"]/didl:Resource/@ref',
                    ns,
                ),
            )
            issue.articles.append(article_object)
            articles_by_identifier.setdefault(article_identifier, []).append(article_object)

        # Pages: direct child items whose identifier contains "<issue>:p".
        page_indicator = f"{issue_identifier}:p"
        page_items = item.xpath(f'./didl:Item[contains(@dc:identifier, "{page_indicator}")]', namespaces=ns)
        for page_item in page_items:
            page_identifier = page_item.get(dc_identifier_attr)
            page_metadata = get_metadata(page_item, page_identifier)
            page_object = Page(
                identifier=page_identifier,
                page_number=get_text(page_metadata, './ddd:nativePageNumber', ns),
                image=get_attribute(
                    page_item,
                    f'./didl:Component[@dc:identifier="{page_identifier}:image"]/didl:Resource/@ref',
                    ns,
                ),
                url=get_text(page_metadata, './dc:identifier', ns),
                articles=[],
            )

            # Link the page to the already-parsed Article objects; ids that
            # match no parsed article are ignored.
            for page_article_item in page_item.xpath('.//didl:Item[@ddd:article_id]', namespaces=ns):
                article_id = page_article_item.get(article_id_attr)
                page_object.articles.extend(articles_by_identifier.get(article_id, ()))

            issue.pages.append(page_object)
        return issue

    def to_dict(self):
        """Return the issue as a dictionary, with articles and pages converted via their ``to_dict``."""
        return {
            'identifier': self.identifier,
            'url': self.url,
            'ppn': self.ppn,
            'title': self.title,
            'date': self.date,
            'source': self.source,
            'rights': self.rights,
            'publisher': self.publisher,
            'volume': self.volume,
            'issue_number': self.issue_number,
            'issued': self.issued,
            'spatial': self.spatial,
            'articles': [article.to_dict() for article in self.articles],
            'pages': [page.to_dict() for page in self.pages]
        }
@@ -0,0 +1,26 @@
1
+ from typing import Optional
2
+ from .article import Article
3
+
4
class Page:
    """One page of an issue, together with the articles that appear on it."""

    def __init__(
        self,
        identifier: str,
        page_number: Optional[int] = None,
        image: Optional[str] = None,
        url: Optional[str] = None,
        articles: Optional[list["Article"]] = None,
    ):
        """Store the given values.

        Args:
            identifier: Identifier of the page.
            page_number: Page number, stored as given.
                NOTE(review): ``Issue.from_oai_record`` passes the XML
                element text here, i.e. a string rather than an int.
            image: Reference to the page image, stored as given.
            url: URL of the page, stored as given.
            articles: Articles on this page. ``None`` means "no articles
                yet" and is replaced by a new empty list.
        """
        self.identifier = identifier
        self.page_number = page_number
        self.image = image
        self.url = url
        # ``is not None`` rather than ``or``: an empty list supplied by the
        # caller must be kept as the same object, so that items the caller
        # appends to it afterwards are visible on the page.
        self.articles = articles if articles is not None else []

    def to_dict(self):
        """Return the page as a dictionary; articles are reduced to their identifiers."""
        return {
            'identifier': self.identifier,
            'page_number': self.page_number,
            'image': self.image,
            'url': self.url,
            'article_identifiers': [article.identifier for article in self.articles]
        }
@@ -0,0 +1,7 @@
1
+ from sickle import Sickle
2
+
3
def oai_get_record(identifier: str, prefix: str = 'DDD:'):
    """Fetch a single record in ``didl`` format from the OAI endpoint at services.kb.nl.

    Args:
        identifier: Record identifier without the collection prefix.
        prefix: Text put in front of ``identifier`` to form the OAI identifier.

    Returns:
        Whatever ``Sickle.GetRecord`` returns for that identifier.
    """
    endpoint = 'https://services.kb.nl/mdo/oai'
    request_arguments = {
        'identifier': prefix + identifier,
        'metadataPrefix': 'didl',
    }
    return Sickle(endpoint).GetRecord(**request_arguments)
@@ -0,0 +1,22 @@
1
+ from .oai import oai_get_record
2
+ from .sru import get_issue_identifiers
3
+ from .utils import save_issues_to_json
4
+ from .models.issue import Issue
5
+
6
def process_ppn_to_json(ppn: str, output_file: str = "issues.json") -> list[Issue]:
    """Fetch every issue found for a PPN, write them to a JSON file and return them.

    Progress messages are printed to stdout along the way.

    Args:
        ppn: PPN to look up issue identifiers for.
        output_file: Path of the JSON file to write.

    Returns:
        The parsed ``Issue`` objects, in the order their identifiers were
        returned by ``get_issue_identifiers``.
    """
    print(f"Verwerken gestart voor PPN: {ppn}")

    found_identifiers = get_issue_identifiers(ppn)
    print(f"Gevonden {len(found_identifiers)} issue-identifiers voor PPN {ppn}")

    collected = []
    for issue_identifier in found_identifiers:
        # One OAI request per issue; the record is parsed immediately.
        parsed = Issue.from_oai_record(oai_get_record(issue_identifier))
        collected.append(parsed)
        print(f"Issue verwerkt: {parsed.identifier}")

    save_issues_to_json(collected, output_file)
    print(f"Verwerken voltooid. Resultaten opgeslagen in {output_file}")

    return collected
@@ -0,0 +1,7 @@
1
def load_ppn_numbers_from_txt(file_path: str) -> list[str]:
    """Read PPN numbers from a text file that holds one PPN per line.

    Surrounding whitespace is stripped from every line. Lines that are empty
    after stripping are skipped, so blank lines do not turn into empty-string
    PPNs.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        The PPNs in file order.
    """
    # "utf-8-sig" reads plain UTF-8 as usual but also drops a leading byte
    # order mark, which would otherwise stick to the first PPN.
    with open(file_path, encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        return [ppn for ppn in stripped_lines if ppn]
@@ -0,0 +1,41 @@
1
+ import requests
2
+ from urllib.parse import quote
3
+ from lxml import etree
4
+
5
def _get_delpher_item_identifiers_from_ppn(ppn_number: int, constraints: str = "(date >= 1940) AND (date <= 1945)", collection: str = 'DDD_artikel', max_records: int = 25, timeout: float = 60.0):
    """Collect the ``dc:identifier`` values of all SRU records found for a PPN.

    The SRU endpoint at jsru.kb.nl is queried page by page until the start
    position of the next page would lie beyond the reported number of records.

    Args:
        ppn_number: PPN inserted into the query as ``ppn any (<ppn_number>)``.
        constraints: Query text that is AND-ed with the PPN clause.
        collection: Value of the ``x-collection`` request parameter.
        max_records: Page size (``maximumRecords``); must be at least 1.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        List of identifier strings in the order the service returned them.

    Raises:
        ValueError: If ``max_records`` is smaller than 1.
        Exception: If a response has a status code other than 200.
        IndexError: If a response contains no ``numberOfRecords`` element.
    """
    # A page size below 1 would never advance past the first page.
    if max_records < 1:
        raise ValueError(f"max_records must be at least 1, got {max_records}")

    ns = {
        "srw": "http://www.loc.gov/zing/srw/",
        "dc": "http://purl.org/dc/elements/1.1/"
    }
    identifiers = []
    query = quote(f"{constraints} AND ppn any ({ppn_number})")
    start_record = 1

    while True:
        url = f"https://jsru.kb.nl/sru/sru?operation=searchRetrieve&version=1.2&recordSchema=dc&query={query}&x-collection={collection}&maximumRecords={max_records}&startRecord={start_record}"

        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            raise Exception(f"SRU request failed: {response.status_code}")

        root = etree.fromstring(response.content)

        nr_of_records = int(root.xpath("srw:numberOfRecords/text()", namespaces=ns)[0])
        identifiers.extend(root.xpath("//srw:recordData/dc:identifier/text()", namespaces=ns))

        # Stop once the next page would start after the last record.
        if start_record + max_records > nr_of_records:
            break
        start_record += max_records
    return identifiers
34
+
35
def get_issue_identifiers(ppn_number: int, constraints: str = "(date >= 1940) AND (date <= 1945)", collection: str = 'DDD_artikel', max_records: int = 25):
    """Return the distinct issue identifiers behind the SRU records of a PPN.

    Each record identifier is reduced to the text between ``urn=`` and the
    first ``:a0``. Duplicates are dropped.

    Args:
        ppn_number: PPN to search for.
        constraints: Query text that is AND-ed with the PPN clause.
        collection: SRU collection to search.
        max_records: Page size used for the SRU requests.

    Returns:
        List of unique issue identifiers, in the order they were first seen.

    Raises:
        IndexError: If a record identifier does not contain ``urn=``.
    """
    delpher_item_identifiers = _get_delpher_item_identifiers_from_ppn(ppn_number, constraints, collection, max_records)
    # Dict keys de-duplicate while keeping first-seen order. A set would make
    # the result order differ between runs because string hashing is
    # randomised per process.
    issue_identifiers = dict.fromkeys(
        item.split("urn=")[1].split(':a0')[0]
        for item in delpher_item_identifiers
    )
    return list(issue_identifiers)
@@ -0,0 +1,8 @@
1
+ import json
2
+ from .models.issue import Issue
3
+
4
def save_issues_to_json(issues: list["Issue"], file_path: str) -> None:
    """Write the given issues to a JSON file.

    Each issue is converted with its ``to_dict`` method and the resulting
    list is dumped with an indentation of four spaces.

    Args:
        issues: Issues to serialise.
        file_path: Path of the file to create or overwrite.
    """
    issues_data = [issue.to_dict() for issue in issues]

    # Pin the encoding instead of relying on the platform default, matching
    # how load_ppn_numbers_from_txt opens its file.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(issues_data, f, indent=4)