thonny-html-highlight 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thonny_html_highlight-0.1.0/PKG-INFO +83 -0
- thonny_html_highlight-0.1.0/README.md +73 -0
- thonny_html_highlight-0.1.0/pyproject.toml +24 -0
- thonny_html_highlight-0.1.0/src/thonnycontrib/html_highlight/__init__.py +112 -0
- thonny_html_highlight-0.1.0/src/thonnycontrib/html_highlight/highlighter.py +149 -0
- thonny_html_highlight-0.1.0/src/thonnycontrib/html_highlight/tokenizer.py +188 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: thonny-html-highlight
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Thonny plugin: syntax highlighting for HTML files
|
|
5
|
+
Author: Geoff Matheson
|
|
6
|
+
Author-email: Geoff Matheson <geoff.matheson@education.vic.gov.au>
|
|
7
|
+
Requires-Dist: thonny>=5.0
|
|
8
|
+
Requires-Python: >=3.13
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# thonny-html-highlight
|
|
12
|
+
|
|
13
|
+
Syntax highlighting for HTML files in the [Thonny](https://thonny.org) IDE.
|
|
14
|
+
|
|
15
|
+
By default Thonny treats `.html` and `.htm` files as plain text. This plugin adds colour highlighting for:
|
|
16
|
+
|
|
17
|
+
- Tag names (`div`, `span`, `a`, …)
|
|
18
|
+
- Attribute names (`class`, `href`, `data-value`, …)
|
|
19
|
+
- Attribute values (`"foo"`, `'bar'`, …)
|
|
20
|
+
- HTML comments (`<!-- … -->`)
|
|
21
|
+
- DOCTYPE declarations (`<!DOCTYPE html>`)
|
|
22
|
+
- Entity references (`&amp;`, `&copy;`, `&#169;`, …)
|
|
23
|
+
- Angle-bracket punctuation
|
|
24
|
+
|
|
25
|
+
## Requirements
|
|
26
|
+
|
|
27
|
+
- Thonny 5.0 or later
|
|
28
|
+
- Python 3.13 or later (bundled with Thonny 5)
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
### Via Thonny's built-in package manager (recommended)
|
|
33
|
+
|
|
34
|
+
1. Open Thonny.
|
|
35
|
+
2. Go to **Tools → Manage packages…**
|
|
36
|
+
3. Search for `thonny-html-highlight`.
|
|
37
|
+
4. Click **Install**.
|
|
38
|
+
5. Restart Thonny.
|
|
39
|
+
|
|
40
|
+
### Via pip / uv (command line)
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Using pip
|
|
44
|
+
pip install thonny-html-highlight
|
|
45
|
+
|
|
46
|
+
# Using uv
|
|
47
|
+
uv pip install thonny-html-highlight
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Then restart Thonny.
|
|
51
|
+
|
|
52
|
+
### From source (development)
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
git clone https://github.com/mrmatho/thonny-html-highlight.git
|
|
56
|
+
cd thonny-html-highlight
|
|
57
|
+
uv pip install -e .
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Restart Thonny after installing.
|
|
61
|
+
|
|
62
|
+
## Usage
|
|
63
|
+
|
|
64
|
+
Open any `.html` or `.htm` file in Thonny — highlighting is applied automatically. No configuration is required.
|
|
65
|
+
|
|
66
|
+
## Known limitations
|
|
67
|
+
|
|
68
|
+
- Content inside `<script>` and `<style>` tags is treated as HTML, which may produce incorrect highlighting for JavaScript or CSS within those blocks.
|
|
69
|
+
- Colours follow Thonny's default light theme. Integration with custom syntax themes is planned for a future release.
|
|
70
|
+
|
|
71
|
+
## Development
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# Run tests
|
|
75
|
+
uv run pytest
|
|
76
|
+
|
|
77
|
+
# Run tests with verbose output
|
|
78
|
+
uv run pytest -v
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Licence
|
|
82
|
+
|
|
83
|
+
MIT
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# thonny-html-highlight
|
|
2
|
+
|
|
3
|
+
Syntax highlighting for HTML files in the [Thonny](https://thonny.org) IDE.
|
|
4
|
+
|
|
5
|
+
By default Thonny treats `.html` and `.htm` files as plain text. This plugin adds colour highlighting for:
|
|
6
|
+
|
|
7
|
+
- Tag names (`div`, `span`, `a`, …)
|
|
8
|
+
- Attribute names (`class`, `href`, `data-value`, …)
|
|
9
|
+
- Attribute values (`"foo"`, `'bar'`, …)
|
|
10
|
+
- HTML comments (`<!-- … -->`)
|
|
11
|
+
- DOCTYPE declarations (`<!DOCTYPE html>`)
|
|
12
|
+
- Entity references (`&amp;`, `&copy;`, `&#169;`, …)
|
|
13
|
+
- Angle-bracket punctuation
|
|
14
|
+
|
|
15
|
+
## Requirements
|
|
16
|
+
|
|
17
|
+
- Thonny 5.0 or later
|
|
18
|
+
- Python 3.13 or later (bundled with Thonny 5)
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
### Via Thonny's built-in package manager (recommended)
|
|
23
|
+
|
|
24
|
+
1. Open Thonny.
|
|
25
|
+
2. Go to **Tools → Manage packages…**
|
|
26
|
+
3. Search for `thonny-html-highlight`.
|
|
27
|
+
4. Click **Install**.
|
|
28
|
+
5. Restart Thonny.
|
|
29
|
+
|
|
30
|
+
### Via pip / uv (command line)
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
# Using pip
|
|
34
|
+
pip install thonny-html-highlight
|
|
35
|
+
|
|
36
|
+
# Using uv
|
|
37
|
+
uv pip install thonny-html-highlight
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Then restart Thonny.
|
|
41
|
+
|
|
42
|
+
### From source (development)
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
git clone https://github.com/mrmatho/thonny-html-highlight.git
|
|
46
|
+
cd thonny-html-highlight
|
|
47
|
+
uv pip install -e .
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Restart Thonny after installing.
|
|
51
|
+
|
|
52
|
+
## Usage
|
|
53
|
+
|
|
54
|
+
Open any `.html` or `.htm` file in Thonny — highlighting is applied automatically. No configuration is required.
|
|
55
|
+
|
|
56
|
+
## Known limitations
|
|
57
|
+
|
|
58
|
+
- Content inside `<script>` and `<style>` tags is treated as HTML, which may produce incorrect highlighting for JavaScript or CSS within those blocks.
|
|
59
|
+
- Colours follow Thonny's default light theme. Integration with custom syntax themes is planned for a future release.
|
|
60
|
+
|
|
61
|
+
## Development
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Run tests
|
|
65
|
+
uv run pytest
|
|
66
|
+
|
|
67
|
+
# Run tests with verbose output
|
|
68
|
+
uv run pytest -v
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Licence
|
|
72
|
+
|
|
73
|
+
MIT
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "thonny-html-highlight"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Thonny plugin: syntax highlighting for HTML files"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Geoff Matheson", email = "geoff.matheson@education.vic.gov.au" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.13"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"thonny>=5.0",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[build-system]
|
|
15
|
+
requires = ["uv_build>=0.9.7,<0.10.0"]
|
|
16
|
+
build-backend = "uv_build"
|
|
17
|
+
|
|
18
|
+
[tool.uv.build-backend]
|
|
19
|
+
module-name = "thonnycontrib.html_highlight"
|
|
20
|
+
module-root = "src"
|
|
21
|
+
|
|
22
|
+
[tool.pytest.ini_options]
|
|
23
|
+
testpaths = ["tests"]
|
|
24
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Thonny plugin: HTML syntax highlighting for ``.html`` / ``.htm`` files.
|
|
2
|
+
|
|
3
|
+
When Thonny loads this plugin it calls :func:`load_plugin`, which binds
|
|
4
|
+
event handlers onto the ``EditorCodeViewText`` widget class. On every text
|
|
5
|
+
change in any editor, the handler checks whether the file is HTML (by
|
|
6
|
+
inspecting its path) and, if so, keeps an :class:`.HtmlHighlighter` instance
|
|
7
|
+
alive on the widget and schedules a re-highlight.
|
|
8
|
+
|
|
9
|
+
Setting ``file_type = "html"`` on the widget tells Thonny to use plain-text
|
|
10
|
+
editing behaviour (simple Return, standard Backspace) rather than Python-aware
|
|
11
|
+
behaviour. It also gives future extensions (auto-indentation, tag
|
|
12
|
+
auto-closing) a clean hook point.
|
|
13
|
+
|
|
14
|
+
Widget hierarchy assumed by :func:`_get_editor_path`:
|
|
15
|
+
|
|
16
|
+
``EditorCodeViewText`` → ``CodeView`` → ``BaseEditor``
|
|
17
|
+
|
|
18
|
+
``BaseEditor.get_target_path()`` returns the file path or ``None`` for
|
|
19
|
+
untitled buffers.
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import tkinter as tk
|
|
24
|
+
|
|
25
|
+
from .highlighter import HtmlHighlighter
|
|
26
|
+
|
|
27
|
+
_HTML_EXTENSIONS = (".html", ".htm")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Helpers
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _get_editor_path(text: tk.Text) -> str | None:
    """Return the file path for *text*'s editor, or ``None`` if unavailable.

    The editor is looked up as ``text.master.master`` (the text widget's
    grandparent) and asked for its path via ``get_target_path()``.

    Args:
        text: The editor text widget whose owning editor is wanted.

    Returns:
        Whatever ``get_target_path()`` returns, or ``None`` when any link in
        the ``master`` chain or the method itself is missing.
    """
    try:
        # text.master is expected to be the CodeView, its master the editor.
        editor = text.master.master
        return editor.get_target_path()
    except AttributeError:
        # Widget is not embedded the way we expect; treat as "no path".
        return None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _is_html_file(text: tk.Text) -> bool:
    """Return ``True`` when *text* is displaying an HTML file.

    The decision is made purely from the editor's file path: it must be
    known (not ``None``) and end, case-insensitively, with one of
    ``_HTML_EXTENSIONS``.
    """
    path = _get_editor_path(text)
    if path is None:
        return False
    return path.lower().endswith(_HTML_EXTENSIONS)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
# Event handlers
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _on_text_change(event: tk.Event) -> None:
    """Handle ``<<TextChange>>`` for an editor text widget.

    Does nothing unless the event's widget is a ``tk.Text`` showing an HTML
    file. Otherwise it makes sure the widget carries an
    :class:`HtmlHighlighter`, re-asserts the ``"html"`` file type, and
    schedules a re-highlight.
    """
    text = event.widget
    if not isinstance(text, tk.Text) or not _is_html_file(text):
        return

    # Create the per-widget highlighter lazily, on the first HTML change.
    if not hasattr(text, "_html_highlighter"):
        text._html_highlighter = HtmlHighlighter(text)

    # Keep file_type set to "html" so Thonny uses plain-text editing
    # behaviour.  update_file_type() resets this on save/rename, so we
    # re-assert it on each text change.
    needs_file_type = getattr(text, "file_type", None) != "html"
    if needs_file_type and hasattr(text, "set_file_type"):
        text.set_file_type("html")

    text._html_highlighter.schedule_update()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _on_appearance_change(event: tk.Event) -> None:
    """Called when the user switches Thonny's syntax theme.

    Reconfigures tag colours and re-highlights every open HTML editor so
    that the new theme takes effect immediately.

    The work is best-effort: a failure for one editor must not prevent the
    others from being refreshed, and no exception escapes into the Tk event
    loop.  Failures are logged instead of being silently discarded.
    """
    import logging  # noqa: PLC0415 — local so no new module-level import is needed

    from thonny import get_workbench  # noqa: PLC0415 — deferred to avoid import at test time

    logger = logging.getLogger(__name__)

    try:
        notebook = get_workbench().get_editor_notebook()
        # Materialise inside the try so a failing iterator is handled too.
        editors = list(notebook.get_all_editors())
    except Exception:
        logger.exception("HTML highlight: could not enumerate open editors")
        return

    for editor in editors:
        try:
            text = editor._code_view.text
            hl: HtmlHighlighter | None = getattr(text, "_html_highlighter", None)
            if hl is not None:
                hl.configure_tags()
                hl.schedule_update()
        except Exception:
            # Keep going: one broken editor should not block the rest.
            logger.exception("HTML highlight: could not refresh an editor")
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
# Plugin entry point
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def load_plugin() -> None:
    """Register the HTML highlighting plugin with Thonny's workbench.

    Binds ``_on_text_change`` to ``<<TextChange>>`` on the
    ``EditorCodeViewText`` widget class and ``_on_appearance_change`` to
    ``<<UpdateAppearance>>`` on the workbench.  Both bindings pass ``True``
    as the final argument so they are added alongside existing handlers.
    """
    from thonny import get_workbench  # noqa: PLC0415 — deferred to avoid import at test time

    workbench = get_workbench()
    workbench.bind_class("EditorCodeViewText", "<<TextChange>>", _on_text_change, True)
    workbench.bind("<<UpdateAppearance>>", _on_appearance_change, True)
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Applies HTML syntax highlighting to a Thonny editor text widget.
|
|
2
|
+
|
|
3
|
+
The :class:`HtmlHighlighter` owns all interaction with the tkinter Text
|
|
4
|
+
widget. It reads tokens from :mod:`.tokenizer` and applies named tags so
|
|
5
|
+
that Thonny's theme system can override colours in the future.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import tkinter as tk
|
|
10
|
+
from typing import TYPE_CHECKING, List
|
|
11
|
+
|
|
12
|
+
from .tokenizer import Token, offsets_to_tkindices, tokenize_html
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
pass # kept for future type-only imports (e.g. SyntaxText)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Tag configuration
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
# Default colours for each token category. These are intentionally chosen
|
|
23
|
+
# to complement Thonny's built-in light theme; a future version will read
|
|
24
|
+
# values from Thonny's syntax-theme API so that dark themes work correctly.
|
|
25
|
+
# tkinter tag name -> keyword options passed to ``Text.tag_configure``.
_TAG_COLORS: dict[str, dict] = {
    "html_comment": {"foreground": "#808080"},
    "html_doctype": {"foreground": "#999999"},
    "html_tag_name": {"foreground": "#000080"},
    "html_attr_name": {"foreground": "#912B6C"},
    "html_attr_value": {"foreground": "#007A00"},
    "html_entity": {"foreground": "#007070"},
    "html_bracket": {"foreground": "#606060"},
}

# Map from tokenizer token type → tkinter tag name.
_TOKEN_TO_TAG: dict[str, str] = {
    "comment": "html_comment",
    "doctype": "html_doctype",
    "tag_name": "html_tag_name",
    "attr_name": "html_attr_name",
    "attr_value": "html_attr_value",
    "entity": "html_entity",
    "bracket": "html_bracket",
}

# Every tag name this plugin may apply, in token-table order.
ALL_HTML_TAGS: List[str] = list(_TOKEN_TO_TAG.values())
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
# Highlighter
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class HtmlHighlighter:
    """Manages HTML syntax highlighting for a single editor text widget.

    One instance is created per text widget the first time an HTML file is
    opened in that widget.  It is stored as ``text._html_highlighter`` so
    that its lifetime follows the widget's.

    Updating is *scheduled* rather than immediate: when :meth:`schedule_update`
    is called (e.g. on every keystroke), it posts a single ``after_idle``
    callback, deduplicating rapid successive calls.
    """

    def __init__(self, text: tk.Text) -> None:
        """Attach to *text* and configure the HTML tags on it."""
        self._text = text
        # True while an idle callback is pending; see schedule_update().
        self._update_scheduled = False
        self.configure_tags()

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def configure_tags(self) -> None:
        """Configure tkinter tag appearance and priority.

        Called once on initialisation and again whenever the Thonny
        appearance changes (``<<UpdateAppearance>>`` event).

        .. todo::
            Read colours from Thonny's syntax-theme API so that dark
            themes are respected automatically.
        """
        for tag, opts in _TAG_COLORS.items():
            self._text.tag_configure(tag, **opts)

        # HTML tags must be raised above the default text tag so they
        # are visible.
        for tag in ALL_HTML_TAGS:
            try:
                self._text.tag_raise(tag)
            except tk.TclError:
                pass  # tag may not exist yet on first call

        # tag_raise() without a second argument gives a tag the highest
        # priority of all, which would put the HTML tags above the
        # selection.  Raise "sel" last so selections still look normal.
        try:
            self._text.tag_raise("sel")
        except tk.TclError:
            pass  # best-effort, same as for the HTML tags above

    def schedule_update(self) -> None:
        """Schedule a re-highlight to run when the event loop is idle.

        Multiple calls before the idle callback fires are collapsed into
        one update, preventing redundant work during rapid typing.
        """
        if self._update_scheduled:
            return
        self._update_scheduled = True
        self._text.after_idle(self._do_update)

    def update(self) -> None:
        """Re-tokenise and re-highlight the full widget content immediately."""
        # Clear the flag first so a failure below cannot block later updates.
        self._update_scheduled = False
        self._apply_highlighting()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _do_update(self) -> None:
        """Idle callback — delegates to :meth:`update`."""
        self.update()

    def _apply_highlighting(self) -> None:
        """Replace all HTML tags on the widget with freshly computed ones."""
        # Remove all existing HTML tags in one pass before adding new ones.
        for tag in ALL_HTML_TAGS:
            self._text.tag_remove(tag, "1.0", "end")

        # "end-1c" drops the newline Tk always keeps at the end of the text.
        content = self._text.get("1.0", "end-1c")
        if not content:
            return

        tokens: List[Token] = tokenize_html(content)
        if not tokens:
            return

        # Collect every start/end offset so all tkinter indices are
        # computed in a single pass rather than one pass per token.
        all_offsets = [
            offset for token in tokens for offset in (token.start, token.end)
        ]
        index_map = offsets_to_tkindices(content, all_offsets)

        for token in tokens:
            tag = _TOKEN_TO_TAG.get(token.type)
            if tag is None:
                continue  # token type without a configured tag
            start_idx = index_map[token.start]
            end_idx = index_map[token.end]
            # Skip zero-width tokens; there is nothing to colour.
            if start_idx != end_idx:
                self._text.tag_add(tag, start_idx, end_idx)
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""HTML tokenizer for syntax highlighting.
|
|
2
|
+
|
|
3
|
+
Contains only pure functions — no tkinter or Thonny imports — so every
|
|
4
|
+
function here can be unit-tested in complete isolation.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from typing import Dict, List, NamedTuple
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Token(NamedTuple):
    """A single syntax token within an HTML document.

    Attributes:
        type:  One of ``'comment'``, ``'doctype'``, ``'entity'``,
               ``'bracket'``, ``'tag_name'``, ``'attr_name'``,
               ``'attr_value'``.
        start: Inclusive start offset in the source string.
        end:   Exclusive end offset in the source string.
    """

    type: str
    start: int
    end: int
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
# Compiled regular expressions
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
|
|
32
|
+
# Matches the four top-level HTML constructs.
|
|
33
|
+
#
|
|
34
|
+
# The 'tag' alternative handles both opening and closing tags (including
|
|
35
|
+
# self-closing). Quoted attribute values are matched explicitly so that a
|
|
36
|
+
# '>' character inside a value does not prematurely end the tag match.
|
|
37
|
+
# NOTE(review): in the 'tag' alternative the quoted-string branches overlap
# with the catch-all ``[^>]`` (a quote character can be consumed by either).
# An unterminated tag followed by many quote characters and no later '>'
# looks like it could backtrack heavily — worth confirming with a stress test.
_TOP_LEVEL_RE = re.compile(
    r"(?P<comment><!--[\s\S]*?-->)"
    r"|(?P<doctype><!DOCTYPE[^>]*>)"
    r"|(?P<entity>&(?:[a-zA-Z][a-zA-Z0-9]*|#\d+|#x[0-9a-fA-F]+);)"
    r'|(?P<tag></?[a-zA-Z][a-zA-Z0-9_:-]*(?:"[^"]*"|\'[^\']*\'|[^>])*>)',
    re.DOTALL | re.IGNORECASE,
)

# Parses the internal structure of a single matched tag string.
# Groups: 'slash' ('/' for closing tags, else empty), 'name', 'attrs'
# (everything between the name and the closing bracket) and 'end'
# ('>' or '/>').
_TAG_RE = re.compile(
    r'^<(?P<slash>/?)(?P<name>[a-zA-Z][a-zA-Z0-9_:-]*)'
    r'(?P<attrs>(?:"[^"]*"|\'[^\']*\'|(?!/>)[^>])*)'
    r'(?P<end>/?>)$',
    re.DOTALL,
)

# Finds individual attribute name + optional value within an attrs string.
# The hyphen is placed at the end of the second character class so it is
# treated as a literal rather than a range operator.
_ATTR_RE = re.compile(
    r'(?P<attr_name>[a-zA-Z_:][a-zA-Z0-9_.:-]*)'
    r'(?:\s*=\s*(?P<attr_value>"[^"]*"|\'[^\']*\'|[^\s>]*))?'
)

# Names of the top-level alternatives, in the order they appear in _TOP_LEVEL_RE.
_TOP_LEVEL_GROUPS = ("comment", "doctype", "entity", "tag")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
# Public API
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def tokenize_html(text: str) -> List[Token]:
    """Return a list of :class:`Token` objects for *text*, sorted by ``start``.

    Tokens cover HTML comments, DOCTYPE declarations, entity references, tag
    brackets, tag names, attribute names, and attribute values.

    The function is intentionally lenient: malformed or incomplete HTML
    produces fewer tokens rather than raising an exception.

    .. note::
        Content inside ``<script>`` and ``<style>`` tags is currently treated
        as plain HTML.  Full script/style awareness is planned for a future
        version.
    """
    tokens: List[Token] = []

    for match in _TOP_LEVEL_RE.finditer(text):
        group = _first_matched_group(match, _TOP_LEVEL_GROUPS)

        if group == "tag":
            # Tags are broken down further into brackets, name and attributes.
            _tokenize_tag(text, match.start(), match.end(), tokens)
        elif group is not None:
            # comment / doctype / entity: the whole match is one token whose
            # type is the name of the alternative that matched.
            tokens.append(Token(group, match.start(), match.end()))

    # The sort is stable, so tokens sharing a start keep insertion order.
    tokens.sort(key=lambda t: t.start)
    return tokens
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def offsets_to_tkindices(text: str, offsets: List[int]) -> Dict[int, str]:
    """Convert character offsets into tkinter ``"line.col"`` index strings.

    Tkinter indices are 1-based for lines and 0-based for columns:
    offset 0 → ``"1.0"``, the position *before* the first character.

    The text is scanned once, front to back, after the distinct offsets have
    been sorted.

    Args:
        text: The source string the offsets refer to.
        offsets: Character offsets into *text*; duplicates are allowed.

    Returns:
        A dict mapping each distinct offset to its index string.  Offsets at
        or below zero map to ``"1.0"``; offsets past the end of *text* map to
        the index of the end of *text* instead of raising ``IndexError``.
    """
    if not offsets:
        return {}

    result: Dict[int, str] = {}
    text_len = len(text)
    line, col, pos = 1, 0, 0

    for target in sorted(set(offsets)):
        # Never scan past the end of the text, even for a too-large offset.
        limit = min(target, text_len)
        while pos < limit:
            if text[pos] == "\n":
                line += 1
                col = 0
            else:
                col += 1
            pos += 1
        result[target] = f"{line}.{col}"

    return result
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# ---------------------------------------------------------------------------
|
|
135
|
+
# Private helpers
|
|
136
|
+
# ---------------------------------------------------------------------------
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _first_matched_group(match: re.Match, names: tuple) -> str | None:
    """Return the first name in *names* whose group actually matched.

    Args:
        match: A match object whose pattern defines every group in *names*.
        names: Group names to probe, in priority order.

    Returns:
        The first name whose group is not ``None``, or ``None`` when no
        listed group took part in the match.
    """
    return next(
        (name for name in names if match.group(name) is not None),
        None,
    )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _tokenize_tag(
    text: str, start: int, end: int, tokens: List[Token]
) -> None:
    """Append bracket/tag_name/attr_name/attr_value tokens for a single tag.

    Args:
        text: The full source string.
        start: Offset in *text* of the tag's opening ``<``.
        end: Offset in *text* just past the tag's closing ``>``.
        tokens: List that receives the new tokens (mutated in place).

    All appended offsets are absolute positions in *text*.  If the slice
    ``text[start:end]`` does not have the shape ``_TAG_RE`` expects, nothing
    is appended.
    """
    m = _TAG_RE.match(text[start:end])
    if m is None:
        # Unrecognised shape — skip rather than crash.
        return

    # Match spans are relative to the slice; adding `start` makes them
    # absolute offsets in `text`.
    name_start = start + m.start("name")
    name_end = start + m.end("name")

    # Opening bracket: '<' (1 char) or '</' (2 chars)
    tokens.append(Token("bracket", start, name_start))

    # Tag name
    tokens.append(Token("tag_name", name_start, name_end))

    # Attributes: spans from _ATTR_RE are relative to the `attrs` group.
    attrs_offset = start + m.start("attrs")
    for attr_m in _ATTR_RE.finditer(m.group("attrs")):
        tokens.append(Token(
            "attr_name",
            attrs_offset + attr_m.start("attr_name"),
            attrs_offset + attr_m.end("attr_name"),
        ))
        if attr_m.group("attr_value") is not None:
            tokens.append(Token(
                "attr_value",
                attrs_offset + attr_m.start("attr_value"),
                attrs_offset + attr_m.end("attr_value"),
            ))

    # Closing bracket: '>' or '/>'
    tokens.append(Token("bracket", start + m.start("end"), end))
|