ssb-pubmd 0.0.19__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ssb_pubmd-0.0.19 → ssb_pubmd-0.1.1}/PKG-INFO +19 -22
- {ssb_pubmd-0.0.19 → ssb_pubmd-0.1.1}/README.md +9 -12
- ssb_pubmd-0.1.1/pyproject.toml +156 -0
- ssb_pubmd-0.1.1/ssb_pubmd/__init__.py +4 -0
- ssb_pubmd-0.1.1/ssb_pubmd/__main__.py +13 -0
- ssb_pubmd-0.1.1/ssb_pubmd/adapters/content_parser.py +185 -0
- ssb_pubmd-0.1.1/ssb_pubmd/adapters/document_processor.py +149 -0
- ssb_pubmd-0.1.1/ssb_pubmd/adapters/publish_client.py +124 -0
- ssb_pubmd-0.1.1/ssb_pubmd/adapters/storage.py +42 -0
- ssb_pubmd-0.1.1/ssb_pubmd/cli.py +78 -0
- ssb_pubmd-0.1.1/ssb_pubmd/config.py +23 -0
- ssb_pubmd-0.1.1/ssb_pubmd/domain/document_publisher.py +46 -0
- ssb_pubmd-0.1.1/ssb_pubmd/notebook_client.py +130 -0
- ssb_pubmd-0.0.19/pyproject.toml +0 -149
- ssb_pubmd-0.0.19/src/ssb_pubmd/__init__.py +0 -6
- ssb_pubmd-0.0.19/src/ssb_pubmd/__main__.py +0 -163
- ssb_pubmd-0.0.19/src/ssb_pubmd/browser_request_handler.py +0 -85
- ssb_pubmd-0.0.19/src/ssb_pubmd/constants.py +0 -22
- ssb_pubmd-0.0.19/src/ssb_pubmd/jwt_request_handler.py +0 -99
- ssb_pubmd-0.0.19/src/ssb_pubmd/markdown_syncer.py +0 -183
- ssb_pubmd-0.0.19/src/ssb_pubmd/request_handler.py +0 -56
- {ssb_pubmd-0.0.19 → ssb_pubmd-0.1.1}/LICENSE +0 -0
- {ssb_pubmd-0.0.19/src → ssb_pubmd-0.1.1}/ssb_pubmd/py.typed +0 -0
|
@@ -1,27 +1,27 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: ssb-pubmd
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: SSB Pubmd
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Olav Landsverk
|
|
7
7
|
Author-email: stud-oll@ssb.no
|
|
8
8
|
Requires-Python: >=3.10,<4.0
|
|
9
|
-
Classifier: Development Status ::
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
10
|
Classifier: License :: OSI Approved :: MIT License
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.10
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.11
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
-
Requires-Dist:
|
|
17
|
-
Requires-Dist:
|
|
18
|
-
Requires-Dist:
|
|
16
|
+
Requires-Dist: dapla-auth-client (>=1.2.5,<2.0.0)
|
|
17
|
+
Requires-Dist: ipynbname (>=2025.8.0.0,<2026.0.0.0)
|
|
18
|
+
Requires-Dist: narwhals (>=2.15.0,<3.0.0)
|
|
19
19
|
Requires-Dist: nbformat (>=5.10.4,<6.0.0)
|
|
20
|
-
Requires-Dist:
|
|
21
|
-
Requires-Dist:
|
|
22
|
-
Requires-Dist:
|
|
23
|
-
Requires-Dist: requests (>=2.32.
|
|
24
|
-
Requires-Dist:
|
|
20
|
+
Requires-Dist: nh3 (>=0.3.2,<0.4.0)
|
|
21
|
+
Requires-Dist: pandocfilters (>=1.5.1,<2.0.0)
|
|
22
|
+
Requires-Dist: pydantic (>=2.12.5,<3.0.0)
|
|
23
|
+
Requires-Dist: requests (>=2.32.4,<3.0.0)
|
|
24
|
+
Requires-Dist: watchfiles (>=1.1.1,<2.0.0)
|
|
25
25
|
Project-URL: Changelog, https://github.com/statisticsnorway/ssb-pubmd/releases
|
|
26
26
|
Project-URL: Documentation, https://statisticsnorway.github.io/ssb-pubmd
|
|
27
27
|
Project-URL: Homepage, https://github.com/statisticsnorway/ssb-pubmd
|
|
@@ -55,28 +55,25 @@ Description-Content-Type: text/markdown
|
|
|
55
55
|
[black]: https://github.com/psf/black
|
|
56
56
|
[poetry]: https://python-poetry.org/
|
|
57
57
|
|
|
58
|
+
## Features
|
|
58
59
|
|
|
59
|
-
|
|
60
|
+
- TODO
|
|
60
61
|
|
|
61
|
-
|
|
62
|
+
## Requirements
|
|
62
63
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
64
|
+
- TODO
|
|
65
|
+
|
|
66
|
+
## Installation
|
|
66
67
|
|
|
67
|
-
|
|
68
|
+
You can install _SSB Pubmd_ via [pip] from [PyPI]:
|
|
68
69
|
|
|
69
70
|
```console
|
|
70
|
-
|
|
71
|
+
pip install ssb-pubmd
|
|
71
72
|
```
|
|
72
73
|
|
|
73
74
|
## Usage
|
|
74
75
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
```console
|
|
78
|
-
pubmd
|
|
79
|
-
```
|
|
76
|
+
Please see the [Reference Guide] for details.
|
|
80
77
|
|
|
81
78
|
## Contributing
|
|
82
79
|
|
|
@@ -25,28 +25,25 @@
|
|
|
25
25
|
[black]: https://github.com/psf/black
|
|
26
26
|
[poetry]: https://python-poetry.org/
|
|
27
27
|
|
|
28
|
+
## Features
|
|
28
29
|
|
|
29
|
-
|
|
30
|
+
- TODO
|
|
30
31
|
|
|
31
|
-
|
|
32
|
+
## Requirements
|
|
32
33
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
- TODO
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
36
37
|
|
|
37
|
-
|
|
38
|
+
You can install _SSB Pubmd_ via [pip] from [PyPI]:
|
|
38
39
|
|
|
39
40
|
```console
|
|
40
|
-
|
|
41
|
+
pip install ssb-pubmd
|
|
41
42
|
```
|
|
42
43
|
|
|
43
44
|
## Usage
|
|
44
45
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
```console
|
|
48
|
-
pubmd
|
|
49
|
-
```
|
|
46
|
+
Please see the [Reference Guide] for details.
|
|
50
47
|
|
|
51
48
|
## Contributing
|
|
52
49
|
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "ssb-pubmd"
|
|
3
|
+
version = "0.1.1"
|
|
4
|
+
description = "SSB Pubmd"
|
|
5
|
+
authors = ["Olav Landsverk <stud-oll@ssb.no>"]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
homepage = "https://github.com/statisticsnorway/ssb-pubmd"
|
|
9
|
+
repository = "https://github.com/statisticsnorway/ssb-pubmd"
|
|
10
|
+
documentation = "https://statisticsnorway.github.io/ssb-pubmd"
|
|
11
|
+
classifiers = ["Development Status :: 3 - Alpha"]
|
|
12
|
+
|
|
13
|
+
[tool.poetry.urls]
|
|
14
|
+
Changelog = "https://github.com/statisticsnorway/ssb-pubmd/releases"
|
|
15
|
+
|
|
16
|
+
[tool.poetry.dependencies]
|
|
17
|
+
python = "^3.10"
|
|
18
|
+
requests = "^2.32.4"
|
|
19
|
+
nbformat = "^5.10.4"
|
|
20
|
+
nh3 = "^0.3.2"
|
|
21
|
+
pandocfilters = "^1.5.1"
|
|
22
|
+
ipynbname = "^2025.8.0.0"
|
|
23
|
+
pydantic = "^2.12.5"
|
|
24
|
+
narwhals = "^2.15.0"
|
|
25
|
+
watchfiles = "^1.1.1"
|
|
26
|
+
dapla-auth-client = "^1.2.5"
|
|
27
|
+
|
|
28
|
+
[tool.poetry.group.dev.dependencies]
|
|
29
|
+
pygments = ">=2.10.0"
|
|
30
|
+
coverage = { extras = ["toml"], version = ">=6.2" }
|
|
31
|
+
furo = ">=2021.11.12"
|
|
32
|
+
mypy = ">=0.930"
|
|
33
|
+
pre-commit = ">=2.16.0"
|
|
34
|
+
pre-commit-hooks = ">=4.1.0"
|
|
35
|
+
ruff = ">=0.0.284"
|
|
36
|
+
pytest = ">=6.2.5"
|
|
37
|
+
sphinx = ">=6.2.1"
|
|
38
|
+
sphinx-autobuild = ">=2021.3.14"
|
|
39
|
+
sphinx-autodoc-typehints = ">=1.24.0"
|
|
40
|
+
sphinx-click = ">=3.0.2"
|
|
41
|
+
typeguard = ">=2.13.3"
|
|
42
|
+
xdoctest = { extras = ["colors"], version = ">=0.15.10" }
|
|
43
|
+
myst-parser = { version = ">=0.16.1" }
|
|
44
|
+
black = "^25.1.0"
|
|
45
|
+
darglint = "^1.8.1"
|
|
46
|
+
types-requests = "^2.32.4.20260107"
|
|
47
|
+
pandas = "^2.3.3"
|
|
48
|
+
types-pyyaml = "^6.0.12.20250915"
|
|
49
|
+
quarto = "^0.1.0"
|
|
50
|
+
|
|
51
|
+
[tool.pytest.ini_options]
|
|
52
|
+
pythonpath = ["ssb_pubmd"]
|
|
53
|
+
|
|
54
|
+
[tool.poetry.scripts]
|
|
55
|
+
ssb-pubmd = "ssb_pubmd.__main__:main"
|
|
56
|
+
|
|
57
|
+
[tool.coverage.paths]
|
|
58
|
+
source = ["ssb_pubmd", "*/site-packages"]
|
|
59
|
+
tests = ["tests", "*/tests"]
|
|
60
|
+
|
|
61
|
+
[tool.coverage.run]
|
|
62
|
+
branch = true
|
|
63
|
+
source = ["ssb_pubmd", "tests"]
|
|
64
|
+
relative_files = true
|
|
65
|
+
|
|
66
|
+
[tool.coverage.report]
|
|
67
|
+
show_missing = true
|
|
68
|
+
fail_under = 50
|
|
69
|
+
|
|
70
|
+
[tool.mypy]
|
|
71
|
+
strict = true
|
|
72
|
+
warn_unreachable = true
|
|
73
|
+
pretty = true
|
|
74
|
+
show_column_numbers = true
|
|
75
|
+
show_error_context = true
|
|
76
|
+
explicit_package_bases = true
|
|
77
|
+
|
|
78
|
+
[tool.ruff]
|
|
79
|
+
force-exclude = true # Apply excludes to pre-commit
|
|
80
|
+
show-fixes = true
|
|
81
|
+
src = ["src", "tests"]
|
|
82
|
+
target-version = "py311" # Minimum Python version supported
|
|
83
|
+
include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
|
|
84
|
+
extend-exclude = [
|
|
85
|
+
"__pycache__",
|
|
86
|
+
"old",
|
|
87
|
+
".ipynb_checkpoints",
|
|
88
|
+
"noxfile.py",
|
|
89
|
+
"docs/conf.py",
|
|
90
|
+
]
|
|
91
|
+
|
|
92
|
+
# Ruff rules may be customized as desired: https://docs.astral.sh/ruff/rules/
|
|
93
|
+
[tool.ruff.lint]
|
|
94
|
+
select = [
|
|
95
|
+
"A", # prevent using keywords that clobber python builtins
|
|
96
|
+
"ANN", # check type annotations
|
|
97
|
+
"B", # bugbear: likely bugs and design problems
|
|
98
|
+
"E", # pycodestyle
|
|
99
|
+
"F", # pyflakes
|
|
100
|
+
"ISC", # implicit string concatenation
|
|
101
|
+
"I", # sort imports
|
|
102
|
+
"UP", # alert you when better syntax is available in your python version
|
|
103
|
+
"RUF", # the ruff developer's own rules
|
|
104
|
+
]
|
|
105
|
+
ignore = [
|
|
106
|
+
"ANN202", # Don't require return type annotation for private functions.
|
|
107
|
+
"ANN401", # Allow type annotation with type Any.
|
|
108
|
+
"D100", # Suppress undocumented-public-module. Only documentation of the public API is required.
|
|
109
|
+
"FBT001", # Allow boolean positional arguments in a function.
|
|
110
|
+
"FBT002", # Allow boolean default positional arguments in a function.
|
|
111
|
+
"E402", # Suppress module-import-not-at-top-of-file, needed in Jupyter notebooks.
|
|
112
|
+
"E501", # Suppress line-too-long warnings: trust black's judgement on this one.
|
|
113
|
+
"PLR2004", # Allow to compare with unnamed numerical constants.
|
|
114
|
+
]
|
|
115
|
+
|
|
116
|
+
[tool.ruff.lint.isort]
|
|
117
|
+
force-single-line = true
|
|
118
|
+
known-first-party = ["ssb_pubmd"]
|
|
119
|
+
|
|
120
|
+
[tool.ruff.lint.mccabe]
|
|
121
|
+
max-complexity = 15
|
|
122
|
+
|
|
123
|
+
[tool.ruff.lint.pydocstyle]
|
|
124
|
+
convention = "google" # You can also use "numpy".
|
|
125
|
+
|
|
126
|
+
[tool.ruff.lint.pylint]
|
|
127
|
+
max-args = 8
|
|
128
|
+
|
|
129
|
+
[tool.ruff.lint.pep8-naming]
|
|
130
|
+
classmethod-decorators = [
|
|
131
|
+
"classmethod",
|
|
132
|
+
"validator",
|
|
133
|
+
"root_validator",
|
|
134
|
+
"pydantic.validator",
|
|
135
|
+
]
|
|
136
|
+
|
|
137
|
+
[tool.ruff.lint.per-file-ignores]
|
|
138
|
+
"*/__init__.py" = ["F401"]
|
|
139
|
+
"**/tests/*" = [
|
|
140
|
+
"ANN001", # type annotations don't add value for test functions
|
|
141
|
+
"ANN002", # type annotations don't add value for test functions
|
|
142
|
+
"ANN003", # type annotations don't add value for test functions
|
|
143
|
+
"ANN201", # type annotations don't add value for test functions
|
|
144
|
+
"ANN204", # type annotations don't add value for test functions
|
|
145
|
+
"ANN205", # type annotations don't add value for test functions
|
|
146
|
+
"ANN206", # type annotations don't add value for test functions
|
|
147
|
+
"D100", # docstrings are overkill for test functions
|
|
148
|
+
"D101",
|
|
149
|
+
"D102",
|
|
150
|
+
"D103",
|
|
151
|
+
"S101", # asserts are encouraged in pytest
|
|
152
|
+
]
|
|
153
|
+
|
|
154
|
+
[build-system]
|
|
155
|
+
requires = ["poetry-core>=1.0.0"]
|
|
156
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
from collections.abc import Mapping
|
|
2
|
+
from dataclasses import asdict
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any
|
|
5
|
+
from typing import Literal
|
|
6
|
+
from typing import Protocol
|
|
7
|
+
|
|
8
|
+
import nh3
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
|
|
12
|
+
class Content:
|
|
13
|
+
title: str
|
|
14
|
+
content_type: str
|
|
15
|
+
publish_folder: str | None = None
|
|
16
|
+
publish_id: str | None = None
|
|
17
|
+
|
|
18
|
+
def to_dict(self) -> dict[str, Any]:
|
|
19
|
+
return asdict(self)
|
|
20
|
+
|
|
21
|
+
def serialize(self) -> dict[str, Any]:
|
|
22
|
+
raise NotImplementedError()
|
|
23
|
+
|
|
24
|
+
class ContentParser(Protocol):
    """Structural interface for objects that turn metadata and HTML into content."""

    def parse(self, metadata: Mapping[str, Any], html: str | None) -> Content:
        """Build a ``Content`` object from ``metadata`` and optional ``html``."""
        ...
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class MimirContent(Content):
    """Content that can be serialized into a Mimir payload."""

    def is_publishable(self) -> bool:
        """Return whether the content can be serialized for publishing.

        The content is publishable when it has a non-empty title and at least
        one of ``publish_id`` or ``publish_folder`` is set.
        """
        if self.title == "":
            return False
        if self.publish_id is None and self.publish_folder is None:
            return False
        return True

    def serialize(self) -> dict[str, Any]:
        """Return the base payload shared by all Mimir content types.

        The payload contains ``contentType`` (prefixed with ``"mimir:"``),
        ``displayName``, ``parentPath`` and an empty ``data`` dictionary that
        subclasses fill in. ``_id`` is only included when ``publish_id`` is set.

        Raises:
            ValueError: If the content is not publishable, see
                ``is_publishable``.
        """
        if not self.is_publishable():
            # A specific exception with context instead of a bare Exception(),
            # so the caller can tell what is missing.
            raise ValueError(
                "Content is not publishable: it needs a non-empty title and "
                "either a publish_id or a publish_folder "
                f"(title={self.title!r}, publish_id={self.publish_id!r}, "
                f"publish_folder={self.publish_folder!r})."
            )
        s: dict[str, Any] = {
            "contentType": "mimir:" + self.content_type,
            "displayName": self.title,
            "parentPath": self.publish_folder,
            "data": {},
        }
        if self.publish_id is not None:
            s["_id"] = self.publish_id
        return s
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class Author:
    """An author entry consisting of a name and an email address."""

    name: str
    email: str
|
|
55
|
+
|
|
56
|
+
@dataclass
class Article(MimirContent):
    """Mimir content of type ``article``: optional authors, an ingress and HTML text."""

    content_type: str = "article"
    authors: list[Author] | None = None
    ingress: str = ""
    html_text: str = ""

    def serialize(self) -> dict[str, Any]:
        """Return the base payload extended with the article's data fields.

        ``authorItemSet`` is only added when ``authors`` is non-empty;
        ``ingress`` and ``articleText`` are always added.
        """
        payload = super().serialize()
        data = payload["data"]
        if self.authors:
            data["authorItemSet"] = [asdict(author) for author in self.authors]
        data["ingress"] = self.ingress
        data["articleText"] = self.html_text
        return payload
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Allowed values for the ``graph_type`` field of ``Highchart``.
GraphType = Literal["line", "pie", "column", "bar", "area", "barNegative"]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class Highchart(MimirContent):
    """Mimir content of type ``highchart``, fed by an HTML table or a TBML source."""

    content_type: str = "highchart"
    graph_type: GraphType = "line"
    html_table: str | None = None
    tbml: str | None = None
    xlabel: str = "x"
    ylabel: str = "y"

    def serialize(self) -> dict[str, Any]:
        """Return the base payload extended with the chart's data fields.

        ``html_table`` takes precedence over ``tbml``; when neither is set, no
        data source is added. The axis titles are always included.
        """
        # NOTE(review): graph_type is not written to the payload — confirm
        # whether that is intended.
        payload = super().serialize()
        data = payload["data"]

        if self.html_table is not None:
            data["htmlTable"] = self.html_table
        elif self.tbml is not None:
            data["dataSource"] = {
                "_selected": "tbprocessor",
                "tbprocessor": {"urlOrId": self.tbml},
            }

        data["xAxisTitle"] = self.xlabel
        data["yAxisTitle"] = self.ylabel
        return payload
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@dataclass
class FactBox(MimirContent):
    """Mimir content of type ``factBox``: a display type and HTML text."""

    content_type: str = "factBox"
    display_type: Literal["default", "sneakPeek", "aiIcon"] = "default"
    html_text: str = ""

    def serialize(self) -> dict[str, Any]:
        """Return the base payload extended with ``expansionBoxType`` and ``text``."""
        payload = super().serialize()
        data = payload["data"]
        data["expansionBoxType"] = self.display_type
        data["text"] = self.html_text
        return payload
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# HTML tags that are kept when sanitizing text content; individual parsers may
# narrow this set further before sanitizing.
BASIC_HTML_TAGS = {
    # Paragraphs and line breaks
    "p", "br",
    # Inline emphasis
    "strong", "em", "b", "i",
    # Lists
    "ul", "ol", "li",
    # Quotes
    "blockquote",
    # Headings
    "h1", "h2", "h3", "h4", "h5",
    # Links
    "a",
}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class MimirContentParser:
|
|
135
|
+
def parse(self, metadata: Mapping[str, Any], html: str | None) -> Content:
|
|
136
|
+
match metadata.get("content_type"):
|
|
137
|
+
case "article":
|
|
138
|
+
return self._parse_article(metadata, html)
|
|
139
|
+
case "factBox":
|
|
140
|
+
return self._parse_factbox(metadata, html)
|
|
141
|
+
case "highchart":
|
|
142
|
+
return self._parse_highchart(metadata, html)
|
|
143
|
+
case _:
|
|
144
|
+
return MimirContent(**metadata)
|
|
145
|
+
|
|
146
|
+
def serialize(self, content: Content) -> dict[str, Any]:
|
|
147
|
+
if isinstance(content, MimirContent):
|
|
148
|
+
return content.serialize()
|
|
149
|
+
else:
|
|
150
|
+
raise Exception()
|
|
151
|
+
|
|
152
|
+
@classmethod
|
|
153
|
+
def _parse_article(cls, metadata: Mapping[str, Any], html: str | None) -> Article:
|
|
154
|
+
article = Article(
|
|
155
|
+
title=metadata["title"],
|
|
156
|
+
publish_folder="/ssb" + metadata["path"],
|
|
157
|
+
publish_id=metadata.get("publish_id"),
|
|
158
|
+
authors=[Author(**data) for data in metadata.get("authors", [])],
|
|
159
|
+
ingress=metadata.get("ingress", ""),
|
|
160
|
+
)
|
|
161
|
+
if html is not None:
|
|
162
|
+
allowed_html_tags = BASIC_HTML_TAGS
|
|
163
|
+
html_text = nh3.clean(html, tags=allowed_html_tags)
|
|
164
|
+
article.html_text = html_text
|
|
165
|
+
return article
|
|
166
|
+
|
|
167
|
+
@classmethod
|
|
168
|
+
def _parse_factbox(cls, metadata: Mapping[str, Any], html: str | None) -> FactBox:
|
|
169
|
+
factbox = FactBox(**metadata)
|
|
170
|
+
if html is not None:
|
|
171
|
+
allowed_html_tags = BASIC_HTML_TAGS - {"h2"}
|
|
172
|
+
html_text = nh3.clean(html, tags=allowed_html_tags)
|
|
173
|
+
factbox.html_text = html_text
|
|
174
|
+
return factbox
|
|
175
|
+
|
|
176
|
+
@classmethod
|
|
177
|
+
def _parse_highchart(
|
|
178
|
+
cls, metadata: Mapping[str, Any], html: str | None
|
|
179
|
+
) -> Highchart:
|
|
180
|
+
highchart = Highchart(**metadata)
|
|
181
|
+
if html is not None:
|
|
182
|
+
allowed_html_tags = {"table", "tbody", "tr", "td"}
|
|
183
|
+
html_table = nh3.clean(html, tags=allowed_html_tags)
|
|
184
|
+
highchart.html_table = html_table
|
|
185
|
+
return highchart
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
|
|
2
|
+
import json
|
|
3
|
+
import subprocess
|
|
4
|
+
from collections.abc import Iterator
|
|
5
|
+
from typing import Any
|
|
6
|
+
from typing import NamedTuple
|
|
7
|
+
from typing import Protocol
|
|
8
|
+
from typing import TypedDict
|
|
9
|
+
|
|
10
|
+
import pandocfilters as pf # type: ignore
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Element(NamedTuple):
|
|
14
|
+
id: str
|
|
15
|
+
inner_html: str | None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DocumentProcessor(Protocol):
    """Structural interface for loading a document and working with its elements."""

    def load(self, raw_content: str) -> None:
        """Load the document from its raw string representation."""
        ...

    def extract_metadata(self, target_key: str) -> dict[str, Any]:
        """Return the document metadata stored under ``target_key``."""
        ...

    def extract_elements(self, target_class: str) -> Iterator[Element]:
        """Yield the elements of the document that carry ``target_class``."""
        ...

    def replace_element(self, id_: str, new_html: str) -> None:
        """Replace the element identified by ``id_`` with ``new_html``."""
        ...

    def extract_html(self) -> str:
        """Return the document as HTML."""
        ...
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class PandocElement(TypedDict):
    """A node of the pandoc AST: a type tag ``t`` and its content ``c``."""

    t: str
    c: Any
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Top-level shape of a JSON-serialized pandoc document. The functional
# TypedDict form is used because "pandoc-api-version" is not a valid identifier.
PandocDocument = TypedDict(
    "PandocDocument",
    {
        "pandoc-api-version": list[int],
        "meta": dict[str, Any],
        "blocks": list[PandocElement],
    },
)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class PandocDocumentProcessor:
    """Processor for a pandoc document, i.e. the JSON-serialized pandoc AST of a document.

    Example pandoc AST with exactly one div:

    ```json
    {
        "pandoc-api-version": [1, 23, 1],
        "meta": {},
        "blocks": [
            {
                "t": "Div",
                "c": [
                    ["my-highchart", ["ssb"], [["title", "My highchart"]]],
                    []
                ]
            }
        ]
    }
    ```

    Html equivalent:

    ```html
    <div id="my-highchart" class="ssb" title="My highchart">
    </div>
    ```

    References:
        - Studying the result of command `pandoc FILE -t json`, where FILE is
          a minimal example document (e.g. Markdown or html).
        - https://github.com/jgm/pandocfilters has some examples of how to
          work with the format.
        - Note: no formal specification exists.
    """

    document: PandocDocument
    # Maps element id -> position in document["blocks"]; rebuilt whenever
    # extract_elements() is iterated.
    _element_index: dict[str, int]

    def load(self, raw_content: str) -> None:
        """Parse a JSON-serialized pandoc AST and reset the element index.

        Args:
            raw_content: The pandoc document as a JSON string.
        """
        self.document = json.loads(raw_content)
        self._element_index = {}

    def extract_metadata(self, target_key: str) -> dict[str, Any]:
        """Return the metadata entry under ``target_key`` as plain Python data.

        Maps become dictionaries and lists become lists, recursively. Scalar
        ``MetaString``/``MetaBool`` values are returned as they are; every
        other node is flattened to text with ``pandocfilters.stringify``.

        Raises:
            KeyError: If ``target_key`` is not present in the metadata.
        """

        def meta_to_dict(meta: Any) -> Any:
            t, c = meta.get("t"), meta.get("c")
            if t == "MetaMap":
                return {k: meta_to_dict(v) for k, v in c.items()}
            if t == "MetaList":
                return [meta_to_dict(v) for v in c]
            if t in ("MetaString", "MetaBool"):
                # The payload is already a plain str/bool. stringify() only
                # collects text from AST nodes and would return "" here,
                # silently dropping the value.
                return c
            return pf.stringify(c)

        return meta_to_dict(self.document["meta"][target_key])  # type: ignore

    def extract_html(self) -> str:
        """Render the whole loaded document to HTML with pandoc."""
        return self._document_to_html(self.document)

    def extract_elements(self, target_class: str) -> Iterator[Element]:
        """Yield the top-level divs that have an id and carry ``target_class``.

        The element index used by ``replace_element`` is rebuilt when
        iteration starts (this is a generator, so nothing happens before the
        first item is requested). ``inner_html`` is ``None`` for an empty div.
        """
        self._element_index = self._generate_element_index(target_class)

        for id_, i in self._element_index.items():
            element = self.document["blocks"][i]
            # A Div's content is [attributes, inner blocks].
            inner_blocks: list[PandocElement] = element["c"][1]
            inner_html = self._blocks_to_html(inner_blocks) if inner_blocks else None
            yield Element(id_, inner_html)

    def replace_element(self, id_: str, new_html: str) -> None:
        """Replace an indexed element with a raw HTML block.

        Raises:
            KeyError: If ``id_`` is not in the index built by the most recent
                ``extract_elements`` iteration.
        """
        i = self._element_index[id_]
        self.document["blocks"][i] = {
            "t": "RawBlock",
            "c": ["html", new_html],
        }

    def _generate_element_index(self, target_class: str) -> dict[str, int]:
        """Map the id of each matching top-level div to its block position.

        Only top-level blocks are inspected. Divs without an id or without
        ``target_class`` are skipped; if several divs share an id, the last
        one wins.
        """
        index: dict[str, int] = {}
        for i, element in enumerate(self.document["blocks"]):
            if element["t"] != "Div":
                continue

            # Div attributes are [id, classes, key/value pairs].
            id_: str = element["c"][0][0]
            if not id_:
                continue

            classes: list[str] = element["c"][0][1]
            if target_class not in classes:
                continue

            index[id_] = i

        return index

    @classmethod
    def _blocks_to_html(cls, blocks: list[PandocElement]) -> str:
        """Render a list of blocks to HTML by wrapping them in a minimal document."""
        document: PandocDocument = {
            "pandoc-api-version": [1, 23, 1],
            "meta": {},
            "blocks": blocks,
        }
        return cls._document_to_html(document)

    @classmethod
    def _document_to_html(cls, document: PandocDocument) -> str:
        """Convert a pandoc document to HTML by running the ``pandoc`` executable.

        Raises:
            subprocess.CalledProcessError: If pandoc exits with a non-zero
                status.
        """
        result = subprocess.run(
            ["pandoc", "-f", "json", "-t", "html"],
            input=json.dumps(document),
            text=True,
            capture_output=True,
            check=True,
        )
        return result.stdout
|