ssb-pubmd 0.0.19__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,27 +1,27 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ssb-pubmd
3
- Version: 0.0.19
3
+ Version: 0.1.1
4
4
  Summary: SSB Pubmd
5
5
  License: MIT
6
6
  Author: Olav Landsverk
7
7
  Author-email: stud-oll@ssb.no
8
8
  Requires-Python: >=3.10,<4.0
9
- Classifier: Development Status :: 1 - Planning
9
+ Classifier: Development Status :: 3 - Alpha
10
10
  Classifier: License :: OSI Approved :: MIT License
11
11
  Classifier: Programming Language :: Python :: 3
12
12
  Classifier: Programming Language :: Python :: 3.10
13
13
  Classifier: Programming Language :: Python :: 3.11
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: 3.13
16
- Requires-Dist: click (>=8.0.1)
17
- Requires-Dist: cryptography (>=45.0.4,<46.0.0)
18
- Requires-Dist: google-cloud-secret-manager (>=2.24.0,<3.0.0)
16
+ Requires-Dist: dapla-auth-client (>=1.2.5,<2.0.0)
17
+ Requires-Dist: ipynbname (>=2025.8.0.0,<2026.0.0.0)
18
+ Requires-Dist: narwhals (>=2.15.0,<3.0.0)
19
19
  Requires-Dist: nbformat (>=5.10.4,<6.0.0)
20
- Requires-Dist: platformdirs (>=4.3.8,<5.0.0)
21
- Requires-Dist: playwright (>=1.51.0,<2.0.0)
22
- Requires-Dist: pyjwt (>=2.10.1,<3.0.0)
23
- Requires-Dist: requests (>=2.32.3,<3.0.0)
24
- Requires-Dist: types-requests (>=2.32.0.20250306,<3.0.0.0)
20
+ Requires-Dist: nh3 (>=0.3.2,<0.4.0)
21
+ Requires-Dist: pandocfilters (>=1.5.1,<2.0.0)
22
+ Requires-Dist: pydantic (>=2.12.5,<3.0.0)
23
+ Requires-Dist: requests (>=2.32.4,<3.0.0)
24
+ Requires-Dist: watchfiles (>=1.1.1,<2.0.0)
25
25
  Project-URL: Changelog, https://github.com/statisticsnorway/ssb-pubmd/releases
26
26
  Project-URL: Documentation, https://statisticsnorway.github.io/ssb-pubmd
27
27
  Project-URL: Homepage, https://github.com/statisticsnorway/ssb-pubmd
@@ -55,28 +55,25 @@ Description-Content-Type: text/markdown
55
55
  [black]: https://github.com/psf/black
56
56
  [poetry]: https://python-poetry.org/
57
57
 
58
+ ## Features
58
59
 
59
- ## Installation
60
+ - TODO
60
61
 
61
- Installation with pip:
62
+ ## Requirements
62
63
 
63
- ```console
64
- pip install ssb-pubmd
65
- ```
64
+ - TODO
65
+
66
+ ## Installation
66
67
 
67
- If you need to create a logged-in browser context, you will also need to install a [Playwright browser](https://playwright.dev/python/docs/browsers#install-browsers):
68
+ You can install _SSB Pubmd_ via [pip] from [PyPI]:
68
69
 
69
70
  ```console
70
- playwright install --with-deps chromium
71
+ pip install ssb-pubmd
71
72
  ```
72
73
 
73
74
  ## Usage
74
75
 
75
- Run the main command in a terminal to see available subcommands with documentation:
76
-
77
- ```console
78
- pubmd
79
- ```
76
+ Please see the [Reference Guide] for details.
80
77
 
81
78
  ## Contributing
82
79
 
@@ -25,28 +25,25 @@
25
25
  [black]: https://github.com/psf/black
26
26
  [poetry]: https://python-poetry.org/
27
27
 
28
+ ## Features
28
29
 
29
- ## Installation
30
+ - TODO
30
31
 
31
- Installation with pip:
32
+ ## Requirements
32
33
 
33
- ```console
34
- pip install ssb-pubmd
35
- ```
34
+ - TODO
35
+
36
+ ## Installation
36
37
 
37
- If you need to create a logged-in browser context, you will also need to install a [Playwright browser](https://playwright.dev/python/docs/browsers#install-browsers):
38
+ You can install _SSB Pubmd_ via [pip] from [PyPI]:
38
39
 
39
40
  ```console
40
- playwright install --with-deps chromium
41
+ pip install ssb-pubmd
41
42
  ```
42
43
 
43
44
  ## Usage
44
45
 
45
- Run the main command in a terminal to see available subcommands with documentation:
46
-
47
- ```console
48
- pubmd
49
- ```
46
+ Please see the [Reference Guide] for details.
50
47
 
51
48
  ## Contributing
52
49
 
@@ -0,0 +1,156 @@
1
+ [tool.poetry]
2
+ name = "ssb-pubmd"
3
+ version = "0.1.1"
4
+ description = "SSB Pubmd"
5
+ authors = ["Olav Landsverk <stud-oll@ssb.no>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+ homepage = "https://github.com/statisticsnorway/ssb-pubmd"
9
+ repository = "https://github.com/statisticsnorway/ssb-pubmd"
10
+ documentation = "https://statisticsnorway.github.io/ssb-pubmd"
11
+ classifiers = ["Development Status :: 3 - Alpha"]
12
+
13
+ [tool.poetry.urls]
14
+ Changelog = "https://github.com/statisticsnorway/ssb-pubmd/releases"
15
+
16
+ [tool.poetry.dependencies]
17
+ python = "^3.10"
18
+ requests = "^2.32.4"
19
+ nbformat = "^5.10.4"
20
+ nh3 = "^0.3.2"
21
+ pandocfilters = "^1.5.1"
22
+ ipynbname = "^2025.8.0.0"
23
+ pydantic = "^2.12.5"
24
+ narwhals = "^2.15.0"
25
+ watchfiles = "^1.1.1"
26
+ dapla-auth-client = "^1.2.5"
27
+
28
+ [tool.poetry.group.dev.dependencies]
29
+ pygments = ">=2.10.0"
30
+ coverage = { extras = ["toml"], version = ">=6.2" }
31
+ furo = ">=2021.11.12"
32
+ mypy = ">=0.930"
33
+ pre-commit = ">=2.16.0"
34
+ pre-commit-hooks = ">=4.1.0"
35
+ ruff = ">=0.0.284"
36
+ pytest = ">=6.2.5"
37
+ sphinx = ">=6.2.1"
38
+ sphinx-autobuild = ">=2021.3.14"
39
+ sphinx-autodoc-typehints = ">=1.24.0"
40
+ sphinx-click = ">=3.0.2"
41
+ typeguard = ">=2.13.3"
42
+ xdoctest = { extras = ["colors"], version = ">=0.15.10" }
43
+ myst-parser = { version = ">=0.16.1" }
44
+ black = "^25.1.0"
45
+ darglint = "^1.8.1"
46
+ types-requests = "^2.32.4.20260107"
47
+ pandas = "^2.3.3"
48
+ types-pyyaml = "^6.0.12.20250915"
49
+ quarto = "^0.1.0"
50
+
51
+ [tool.pytest.ini_options]
52
+ pythonpath = ["ssb_pubmd"]
53
+
54
+ [tool.poetry.scripts]
55
+ ssb-pubmd = "ssb_pubmd.__main__:main"
56
+
57
+ [tool.coverage.paths]
58
+ source = ["ssb_pubmd", "*/site-packages"]
59
+ tests = ["tests", "*/tests"]
60
+
61
+ [tool.coverage.run]
62
+ branch = true
63
+ source = ["ssb_pubmd", "tests"]
64
+ relative_files = true
65
+
66
+ [tool.coverage.report]
67
+ show_missing = true
68
+ fail_under = 50
69
+
70
+ [tool.mypy]
71
+ strict = true
72
+ warn_unreachable = true
73
+ pretty = true
74
+ show_column_numbers = true
75
+ show_error_context = true
76
+ explicit_package_bases = true
77
+
78
+ [tool.ruff]
79
+ force-exclude = true # Apply excludes to pre-commit
80
+ show-fixes = true
81
+ src = ["src", "tests"]
82
target-version = "py310"  # Minimum Python version supported (matches python = "^3.10")
83
+ include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
84
+ extend-exclude = [
85
+ "__pycache__",
86
+ "old",
87
+ ".ipynb_checkpoints",
88
+ "noxfile.py",
89
+ "docs/conf.py",
90
+ ]
91
+
92
+ # Ruff rules may be customized as desired: https://docs.astral.sh/ruff/rules/
93
+ [tool.ruff.lint]
94
+ select = [
95
+ "A", # prevent using keywords that clobber python builtins
96
+ "ANN", # check type annotations
97
+ "B", # bugbear: security warnings
98
+ "E", # pycodestyle
99
+ "F", # pyflakes
100
+ "ISC", # implicit string concatenation
101
+ "I", # sort imports
102
+ "UP", # alert you when better syntax is available in your python version
103
+ "RUF", # the ruff developer's own rules
104
+ ]
105
+ ignore = [
106
+ "ANN202", # Don't require return type annotation for private functions.
107
+ "ANN401", # Allow type annotation with type Any.
108
+ "D100", # Suppress undocumented-public-module. Only doc of public api required.
109
+ "FBT001", # Allow boolean positional arguments in a function.
110
+ "FBT002", # Allow boolean default positional arguments in a function.
111
+ "E402", # Suppress module-import-not-at-top-of-file, needed in jupyter notebooks.
112
+ "E501", # Suppress line-too-long warnings: trust black's judgement on this one.
113
+ "PLR2004", # Allow to compare with unnamed numerical constants.
114
+ ]
115
+
116
+ [tool.ruff.lint.isort]
117
+ force-single-line = true
118
+ known-first-party = ["ssb_pubmd"]
119
+
120
+ [tool.ruff.lint.mccabe]
121
+ max-complexity = 15
122
+
123
+ [tool.ruff.lint.pydocstyle]
124
+ convention = "google" # You can also use "numpy".
125
+
126
+ [tool.ruff.lint.pylint]
127
+ max-args = 8
128
+
129
+ [tool.ruff.lint.pep8-naming]
130
+ classmethod-decorators = [
131
+ "classmethod",
132
+ "validator",
133
+ "root_validator",
134
+ "pydantic.validator",
135
+ ]
136
+
137
+ [tool.ruff.lint.per-file-ignores]
138
+ "*/__init__.py" = ["F401"]
139
+ "**/tests/*" = [
140
+ "ANN001", # type annotations don't add value for test functions
141
+ "ANN002", # type annotations don't add value for test functions
142
+ "ANN003", # type annotations don't add value for test functions
143
+ "ANN201", # type annotations don't add value for test functions
144
+ "ANN204", # type annotations don't add value for test functions
145
+ "ANN205", # type annotations don't add value for test functions
146
+ "ANN206", # type annotations don't add value for test functions
147
+ "D100", # docstrings are overkill for test functions
148
+ "D101",
149
+ "D102",
150
+ "D103",
151
+ "S101", # asserts are encouraged in pytest
152
+ ]
153
+
154
+ [build-system]
155
+ requires = ["poetry-core>=1.0.0"]
156
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,4 @@
1
+ from ssb_pubmd.notebook_client import configure_factbox as Factbox
2
+ from ssb_pubmd.notebook_client import create_highchart as Highchart
3
+
4
+ __all__ = ["Factbox", "Highchart"]
@@ -0,0 +1,13 @@
1
+ import sys
2
+
3
+ from ssb_pubmd.cli import run_cli
4
+ from ssb_pubmd.config import get_config
5
+
6
+
7
+ def main() -> None:
8
+ config = get_config()
9
+ run_cli(sys.argv, config)
10
+
11
+
12
+ if __name__ == "__main__":
13
+ main()
@@ -0,0 +1,185 @@
1
+ from collections.abc import Mapping
2
+ from dataclasses import asdict
3
+ from dataclasses import dataclass
4
+ from typing import Any
5
+ from typing import Literal
6
+ from typing import Protocol
7
+
8
+ import nh3
9
+
10
+
11
+ @dataclass
12
+ class Content:
13
+ title: str
14
+ content_type: str
15
+ publish_folder: str | None = None
16
+ publish_id: str | None = None
17
+
18
+ def to_dict(self) -> dict[str, Any]:
19
+ return asdict(self)
20
+
21
+ def serialize(self) -> dict[str, Any]:
22
+ raise NotImplementedError()
23
+
24
class ContentParser(Protocol):
    """Structural interface for objects that build ``Content`` from parsed input."""

    def parse(self, metadata: Mapping[str, Any], html: str | None) -> Content:
        """Build a ``Content`` object from a metadata mapping and optional HTML.

        Args:
            metadata: Key/value metadata describing the content.
            html: HTML belonging to the content, or ``None`` when there is none.

        Returns:
            The parsed content object.
        """
        ...
26
+
27
+
28
@dataclass
class MimirContent(Content):
    """Content that can be serialized into a ``mimir:`` payload dictionary."""

    def is_publishable(self) -> bool:
        """Tell whether the content carries enough information to be serialized.

        Returns:
            ``True`` when the title is non-empty and at least one of
            ``publish_id`` and ``publish_folder`` is set; ``False`` otherwise.
        """
        has_target = self.publish_id is not None or self.publish_folder is not None
        return self.title != "" and has_target

    def serialize(self) -> dict[str, Any]:
        """Build the payload dictionary for this content.

        Returns:
            A dictionary with the keys ``contentType``, ``displayName``,
            ``parentPath`` and an empty ``data`` dictionary that subclasses
            fill in. ``_id`` is added only when ``publish_id`` is set.

        Raises:
            ValueError: If the content is not publishable, see
                ``is_publishable``.
        """
        if not self.is_publishable():
            # A bare Exception gave callers nothing to act on; ValueError is
            # still an Exception subclass, so existing handlers keep working.
            raise ValueError(
                f"{type(self).__name__} is not publishable: a non-empty title and "
                "at least one of publish_id and publish_folder are required "
                f"(title={self.title!r}, publish_id={self.publish_id!r}, "
                f"publish_folder={self.publish_folder!r})."
            )
        s: dict[str, Any] = {
            "contentType": "mimir:" + self.content_type,
            "displayName": self.title,
            # May be None when only publish_id is given.
            "parentPath": self.publish_folder,
            "data": {},
        }
        if self.publish_id is not None:
            s["_id"] = self.publish_id
        return s
49
+
50
+
51
@dataclass
class Author:
    """Name and e-mail address of a single author."""

    # Both fields are required and positional, in this order.
    name: str
    email: str
55
+
56
@dataclass
class Article(MimirContent):
    """Article content with optional authors, an ingress and an HTML body."""

    content_type: str = "article"
    authors: list[Author] | None = None
    ingress: str = ""
    html_text: str = ""

    def serialize(self) -> dict[str, Any]:
        """Extend the base payload with the article-specific ``data`` entries.

        Returns:
            The base payload whose ``data`` holds ``ingress`` and
            ``articleText``, plus ``authorItemSet`` when there are authors.
        """
        payload = super().serialize()
        data = payload["data"]
        # An empty or missing author list leaves "authorItemSet" out entirely.
        if self.authors:
            data["authorItemSet"] = [asdict(person) for person in self.authors]
        data["ingress"] = self.ingress
        data["articleText"] = self.html_text
        return payload
70
+
71
+
72
# Values accepted for ``Highchart.graph_type``.
GraphType = Literal["line", "pie", "column", "bar", "area", "barNegative"]


@dataclass
class Highchart(MimirContent):
    """Highchart content backed by either an HTML table or a TBML reference."""

    content_type: str = "highchart"
    # NOTE(review): graph_type is never written to the payload by serialize();
    # confirm whether that is intentional.
    graph_type: GraphType = "line"
    html_table: str | None = None
    tbml: str | None = None
    xlabel: str = "x"
    ylabel: str = "y"

    def serialize(self) -> dict[str, Any]:
        """Extend the base payload with the chart-specific ``data`` entries.

        When both ``html_table`` and ``tbml`` are set, the HTML table wins and
        no ``dataSource`` entry is written.

        Returns:
            The base payload whose ``data`` holds the axis titles and, when
            available, ``htmlTable`` or ``dataSource``.
        """
        payload = super().serialize()
        data = payload["data"]

        if self.html_table is not None:
            data["htmlTable"] = self.html_table
        elif self.tbml is not None:
            data["dataSource"] = {
                "_selected": "tbprocessor",
                "tbprocessor": {"urlOrId": self.tbml},
            }

        data["xAxisTitle"] = self.xlabel
        data["yAxisTitle"] = self.ylabel

        return payload
99
+
100
+
101
@dataclass
class FactBox(MimirContent):
    """Fact box content with a display type and an HTML body."""

    content_type: str = "factBox"
    display_type: Literal["default", "sneakPeek", "aiIcon"] = "default"
    html_text: str = ""

    def serialize(self) -> dict[str, Any]:
        """Extend the base payload with the fact-box-specific ``data`` entries.

        Returns:
            The base payload whose ``data`` holds ``expansionBoxType`` and
            ``text``.
        """
        payload = super().serialize()
        data = payload["data"]
        data["expansionBoxType"] = self.display_type
        data["text"] = self.html_text
        return payload
112
+
113
+
114
+ BASIC_HTML_TAGS = {
115
+ "p",
116
+ "br",
117
+ "strong",
118
+ "em",
119
+ "b",
120
+ "i",
121
+ "ul",
122
+ "ol",
123
+ "li",
124
+ "blockquote",
125
+ "h1",
126
+ "h2",
127
+ "h3",
128
+ "h4",
129
+ "h5",
130
+ "a",
131
+ }
132
+
133
+
134
+ class MimirContentParser:
135
+ def parse(self, metadata: Mapping[str, Any], html: str | None) -> Content:
136
+ match metadata.get("content_type"):
137
+ case "article":
138
+ return self._parse_article(metadata, html)
139
+ case "factBox":
140
+ return self._parse_factbox(metadata, html)
141
+ case "highchart":
142
+ return self._parse_highchart(metadata, html)
143
+ case _:
144
+ return MimirContent(**metadata)
145
+
146
+ def serialize(self, content: Content) -> dict[str, Any]:
147
+ if isinstance(content, MimirContent):
148
+ return content.serialize()
149
+ else:
150
+ raise Exception()
151
+
152
+ @classmethod
153
+ def _parse_article(cls, metadata: Mapping[str, Any], html: str | None) -> Article:
154
+ article = Article(
155
+ title=metadata["title"],
156
+ publish_folder="/ssb" + metadata["path"],
157
+ publish_id=metadata.get("publish_id"),
158
+ authors=[Author(**data) for data in metadata.get("authors", [])],
159
+ ingress=metadata.get("ingress", ""),
160
+ )
161
+ if html is not None:
162
+ allowed_html_tags = BASIC_HTML_TAGS
163
+ html_text = nh3.clean(html, tags=allowed_html_tags)
164
+ article.html_text = html_text
165
+ return article
166
+
167
+ @classmethod
168
+ def _parse_factbox(cls, metadata: Mapping[str, Any], html: str | None) -> FactBox:
169
+ factbox = FactBox(**metadata)
170
+ if html is not None:
171
+ allowed_html_tags = BASIC_HTML_TAGS - {"h2"}
172
+ html_text = nh3.clean(html, tags=allowed_html_tags)
173
+ factbox.html_text = html_text
174
+ return factbox
175
+
176
+ @classmethod
177
+ def _parse_highchart(
178
+ cls, metadata: Mapping[str, Any], html: str | None
179
+ ) -> Highchart:
180
+ highchart = Highchart(**metadata)
181
+ if html is not None:
182
+ allowed_html_tags = {"table", "tbody", "tr", "td"}
183
+ html_table = nh3.clean(html, tags=allowed_html_tags)
184
+ highchart.html_table = html_table
185
+ return highchart
@@ -0,0 +1,149 @@
1
+
2
+ import json
3
+ import subprocess
4
+ from collections.abc import Iterator
5
+ from typing import Any
6
+ from typing import NamedTuple
7
+ from typing import Protocol
8
+ from typing import TypedDict
9
+
10
+ import pandocfilters as pf # type: ignore
11
+
12
+
13
+ class Element(NamedTuple):
14
+ id: str
15
+ inner_html: str | None
16
+
17
+
18
class DocumentProcessor(Protocol):
    """Structural interface for loading, inspecting and rewriting a document."""

    def load(self, raw_content: str) -> None:
        """Load the document from its raw string form."""
        ...

    def extract_metadata(self, target_key: str) -> dict[str, Any]:
        """Return the document metadata stored under ``target_key``."""
        ...

    def extract_elements(self, target_class: str) -> Iterator[Element]:
        """Iterate over the elements that carry ``target_class``."""
        ...

    def replace_element(self, id_: str, new_html: str) -> None:
        """Replace the element identified by ``id_`` with ``new_html``."""
        ...

    def extract_html(self) -> str:
        """Return the document as HTML."""
        ...
24
+
25
+
26
+
27
class PandocElement(TypedDict):
    """A node of the pandoc JSON AST."""

    # Node type tag, e.g. "Div" or "RawBlock".
    t: str
    # Node payload; its shape depends on the type tag.
    c: Any
30
+
31
+
32
# Top-level shape of a JSON-serialized pandoc document. The functional
# TypedDict form is required because "pandoc-api-version" is not a valid
# Python identifier.
PandocDocument = TypedDict(
    "PandocDocument",
    {
        "pandoc-api-version": list[int],
        "meta": dict[str, Any],
        "blocks": list[PandocElement],
    },
)
40
+
41
+
42
class PandocDocumentProcessor:
    """Processor for a pandoc document, i.e. the JSON-serialized pandoc AST.

    Elements are looked up among the top-level blocks of the document only.

    Example pandoc AST with exactly one div:

    ```json
    {
        "pandoc-api-version": [1, 23, 1],
        "meta": {},
        "blocks": [
            {
                "t": "Div",
                "c": [
                    ["my-highchart", ["ssb"], [["title", "My highchart"]]],
                    []
                ]
            }
        ]
    }
    ```

    HTML equivalent:

    ```html
    <div id="my-highchart" class="ssb" title="My highchart">
    </div>
    ```

    References:
        - Studying the result of command `pandoc FILE -t json`, where FILE is
          a minimal example document (e.g. Markdown or html).
        - https://github.com/jgm/pandocfilters has some examples of how to
          work with the format.
        - Note: no formal specification exists.
    """

    document: PandocDocument
    _element_index: dict[str, int]

    def load(self, raw_content: str) -> None:
        """Parse ``raw_content`` as JSON and reset the element index.

        Args:
            raw_content: The JSON-serialized pandoc AST.
        """
        self.document = json.loads(raw_content)
        self._element_index = {}

    def extract_metadata(self, target_key: str) -> dict[str, Any]:
        """Convert the metadata entry ``target_key`` into plain Python values.

        ``MetaMap`` becomes a dict and ``MetaList`` a list, both converted
        recursively. ``MetaString`` and ``MetaBool`` payloads are returned
        unchanged. Every other node is flattened to text with
        ``pandocfilters.stringify``.

        Args:
            target_key: Key of the entry in the document's ``meta`` mapping.

        Returns:
            The converted metadata value.

        Raises:
            KeyError: If ``target_key`` is not present in the metadata.
        """

        def meta_to_python(meta: Any) -> Any:
            kind, value = meta.get("t"), meta.get("c")
            if kind == "MetaMap":
                return {k: meta_to_python(v) for k, v in value.items()}
            if kind == "MetaList":
                return [meta_to_python(v) for v in value]
            if kind in ("MetaString", "MetaBool"):
                # These payloads are bare scalars. stringify() only walks
                # lists and dicts, so it would turn them into "".
                return value
            return pf.stringify(value)

        return meta_to_python(self.document["meta"][target_key])  # type: ignore

    def extract_html(self) -> str:
        """Render the whole loaded document to HTML with pandoc."""
        return self._document_to_html(self.document)

    def extract_elements(self, target_class: str) -> Iterator[Element]:
        """Yield the top-level divs that have an id and carry ``target_class``.

        This is a generator: the element index used by ``replace_element`` is
        rebuilt when iteration starts, not when the method is called.

        Args:
            target_class: Class name a div must have to be yielded.

        Yields:
            One ``Element`` per matching div. ``inner_html`` is ``None`` when
            the div has no inner blocks.
        """
        self._element_index = self._generate_element_index(target_class)

        for id_, position in self._element_index.items():
            element = self.document["blocks"][position]
            inner_blocks: list[PandocElement] = element["c"][1]
            inner_html = self._blocks_to_html(inner_blocks) if inner_blocks else None
            yield Element(id_, inner_html)

    def replace_element(self, id_: str, new_html: str) -> None:
        """Replace an indexed element with a raw HTML block.

        Args:
            id_: Id of an element found by the latest ``extract_elements`` run.
            new_html: HTML to put in place of the element.

        Raises:
            KeyError: If ``id_`` is not in the current element index.
        """
        position = self._element_index[id_]
        self.document["blocks"][position] = {
            "t": "RawBlock",
            "c": ["html", new_html],
        }

    def _generate_element_index(self, target_class: str) -> dict[str, int]:
        """Map the id of each matching top-level div to its block position.

        Divs without an id are skipped. If several divs share an id, the last
        one wins.
        """
        index: dict[str, int] = {}
        for position, element in enumerate(self.document["blocks"]):
            if element["t"] != "Div":
                continue

            # A Div payload is [[id, classes, attributes], inner_blocks].
            id_: str = element["c"][0][0]
            if not id_:
                continue

            classes: list[str] = element["c"][0][1]
            if target_class not in classes:
                continue

            index[id_] = position

        return index

    @classmethod
    def _blocks_to_html(cls, blocks: list[PandocElement]) -> str:
        """Render ``blocks`` to HTML by wrapping them in a minimal document."""
        document: PandocDocument = {
            "pandoc-api-version": [1, 23, 1],
            "meta": {},
            "blocks": blocks,
        }
        return cls._document_to_html(document)

    @classmethod
    def _document_to_html(cls, document: PandocDocument) -> str:
        """Convert a pandoc document to HTML by running the ``pandoc`` command.

        Raises:
            subprocess.CalledProcessError: If pandoc exits with a non-zero
                status.
        """
        result = subprocess.run(
            ["pandoc", "-f", "json", "-t", "html"],
            input=json.dumps(document),
            text=True,
            capture_output=True,
            check=True,
        )
        return result.stdout