justhtml 0.21.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. justhtml-0.21.0/.github/copilot-instructions.md +81 -0
  2. justhtml-0.21.0/.github/workflows/ci.yml +105 -0
  3. justhtml-0.21.0/.github/workflows/publish.yml +34 -0
  4. justhtml-0.21.0/.gitignore +174 -0
  5. justhtml-0.21.0/.pre-commit-config.yaml +33 -0
  6. justhtml-0.21.0/CODE_OF_CONDUCT.md +128 -0
  7. justhtml-0.21.0/CONTRIBUTING.md +109 -0
  8. justhtml-0.21.0/LICENSE +24 -0
  9. justhtml-0.21.0/PKG-INFO +143 -0
  10. justhtml-0.21.0/README.md +114 -0
  11. justhtml-0.21.0/benchmarks/correctness.py +1005 -0
  12. justhtml-0.21.0/benchmarks/fuzz.py +2204 -0
  13. justhtml-0.21.0/benchmarks/performance.py +746 -0
  14. justhtml-0.21.0/benchmarks/profile.py +72 -0
  15. justhtml-0.21.0/docs/api.md +341 -0
  16. justhtml-0.21.0/docs/cli.md +101 -0
  17. justhtml-0.21.0/docs/correctness.md +225 -0
  18. justhtml-0.21.0/docs/encoding.md +94 -0
  19. justhtml-0.21.0/docs/errors.md +263 -0
  20. justhtml-0.21.0/docs/fragments.md +231 -0
  21. justhtml-0.21.0/docs/index.md +112 -0
  22. justhtml-0.21.0/docs/quickstart.md +179 -0
  23. justhtml-0.21.0/docs/sanitization.md +267 -0
  24. justhtml-0.21.0/docs/selectors.md +149 -0
  25. justhtml-0.21.0/docs/streaming.md +158 -0
  26. justhtml-0.21.0/docs/text.md +96 -0
  27. justhtml-0.21.0/pyproject.toml +104 -0
  28. justhtml-0.21.0/run_tests.py +352 -0
  29. justhtml-0.21.0/src/justhtml/__init__.py +23 -0
  30. justhtml-0.21.0/src/justhtml/__main__.py +170 -0
  31. justhtml-0.21.0/src/justhtml/constants.py +445 -0
  32. justhtml-0.21.0/src/justhtml/context.py +12 -0
  33. justhtml-0.21.0/src/justhtml/encoding.py +405 -0
  34. justhtml-0.21.0/src/justhtml/entities.py +382 -0
  35. justhtml-0.21.0/src/justhtml/errors.py +149 -0
  36. justhtml-0.21.0/src/justhtml/node.py +779 -0
  37. justhtml-0.21.0/src/justhtml/parser.py +170 -0
  38. justhtml-0.21.0/src/justhtml/py.typed +0 -0
  39. justhtml-0.21.0/src/justhtml/sanitize.py +741 -0
  40. justhtml-0.21.0/src/justhtml/selector.py +1061 -0
  41. justhtml-0.21.0/src/justhtml/serialize.py +560 -0
  42. justhtml-0.21.0/src/justhtml/stream.py +107 -0
  43. justhtml-0.21.0/src/justhtml/tokenizer.py +2662 -0
  44. justhtml-0.21.0/src/justhtml/tokens.py +223 -0
  45. justhtml-0.21.0/src/justhtml/treebuilder.py +1290 -0
  46. justhtml-0.21.0/src/justhtml/treebuilder_modes.py +2072 -0
  47. justhtml-0.21.0/src/justhtml/treebuilder_utils.py +93 -0
  48. justhtml-0.21.0/test-summary.txt +106 -0
  49. justhtml-0.21.0/tests/README.md +31 -0
  50. justhtml-0.21.0/tests/__init__.py +0 -0
  51. justhtml-0.21.0/tests/data/wikipedia.html +1016 -0
  52. justhtml-0.21.0/tests/harness/__init__.py +4 -0
  53. justhtml-0.21.0/tests/harness/encoding.py +138 -0
  54. justhtml-0.21.0/tests/harness/regressions.py +82 -0
  55. justhtml-0.21.0/tests/harness/reporter.py +135 -0
  56. justhtml-0.21.0/tests/harness/serializer.py +543 -0
  57. justhtml-0.21.0/tests/harness/tokenizer.py +351 -0
  58. justhtml-0.21.0/tests/harness/tree.py +398 -0
  59. justhtml-0.21.0/tests/justhtml-sanitize-tests/cases.json +518 -0
  60. justhtml-0.21.0/tests/justhtml-tests/branch_coverage.dat +450 -0
  61. justhtml-0.21.0/tests/justhtml-tests/coverage_gaps.test +49 -0
  62. justhtml-0.21.0/tests/justhtml-tests/empty_stack_edge_cases.dat +63 -0
  63. justhtml-0.21.0/tests/justhtml-tests/entities.test +47 -0
  64. justhtml-0.21.0/tests/justhtml-tests/iframe_srcdoc.dat +13 -0
  65. justhtml-0.21.0/tests/justhtml-tests/tokenizer_edge_cases.test +217 -0
  66. justhtml-0.21.0/tests/justhtml-tests/treebuilder_coverage.dat +253 -0
  67. justhtml-0.21.0/tests/justhtml-tests/xml_coercion.dat +53 -0
  68. justhtml-0.21.0/tests/justhtml-tests/xml_coercion_coverage.test +14 -0
  69. justhtml-0.21.0/tests/test_cli.py +172 -0
  70. justhtml-0.21.0/tests/test_coverage.py +128 -0
  71. justhtml-0.21.0/tests/test_docs_examples.py +252 -0
  72. justhtml-0.21.0/tests/test_encoding.py +119 -0
  73. justhtml-0.21.0/tests/test_errors.py +420 -0
  74. justhtml-0.21.0/tests/test_node.py +606 -0
  75. justhtml-0.21.0/tests/test_precommit_coverage.py +76 -0
  76. justhtml-0.21.0/tests/test_sanitize.py +215 -0
  77. justhtml-0.21.0/tests/test_sanitize_integration.py +112 -0
  78. justhtml-0.21.0/tests/test_selector.py +1703 -0
  79. justhtml-0.21.0/tests/test_serialize.py +557 -0
  80. justhtml-0.21.0/tests/test_stream.py +65 -0
  81. justhtml-0.21.0/tests/test_wikipedia.py +83 -0
@@ -0,0 +1,81 @@
1
+ ## JustHTML – Agent instructions
2
+
3
+ ### Decision & Clarification Policy (Overrides)
4
+
5
+ - Replace "propose a follow-up" with "propose **and execute** the best alternative by default; ask only for destructive/irreversible choices."
6
+ - Keep preambles to a single declarative sentence ("I'm scanning the repo and then drafting a minimal fix.") — no approval requests.
7
+
8
+ ### Architecture Snapshot
9
+ - Tokenizer (`tokenizer.py`): HTML5 spec state machine (~60 states). Handles RCDATA, RAWTEXT, CDATA, script escaping, comments, DOCTYPE, etc.
10
+ - Tree builder (`treebuilder.py`): Token sink that constructs DOM tree following HTML5 construction rules.
11
+ - Node tree (`node.py`): DOM-like structure. Always use `append_child()` / `insert_before()` for tree operations.
12
+ - Entities (`entities.py`): HTML5 character reference decoding (named & numeric entities).
13
+ - Constants (`constants.py`): HTML5 element categories, void elements, formatting elements, etc.
14
+
15
+ ### Golden Rules
16
+ 1. **Spec compliance first**: Follow WHATWG HTML5 spec exactly. No heuristics, no shortcuts.
17
+ 2. **No exceptions in hot paths**: Use deterministic control flow, not try/except for branching.
18
+ 3. **No reflective probing**: No `hasattr`, `getattr`, or `delattr` - all data structures used are deterministic.
19
+ 4. **Minimal allocations**: Reuse buffers, avoid per-token object creation in tokenizer.
20
+ 5. **No shared token references**: Create new token objects when emitting (don't reuse references).
21
+ 6. **State machine purity**: Tokenizer state transitions follow spec state machine exactly.
22
+ 7. **No test-specific code**: No references to test files in comments or code.
23
+
24
+ ### Testing Workflow
25
+ 1. **Target failures**: Use `--test-specs file:indices` to run specific tests
26
+ ```bash
27
+ python run_tests.py --test-specs test2.test:5,10 -v
28
+ ```
29
+
30
+ 2. **Check test output**: Use `-v` for diffs, `-vv` for debug output
31
+ ```bash
32
+ python run_tests.py --test-specs test3.test -vv
33
+ ```
34
+
35
+ 3. **Run full suite**: Always check for regressions
36
+ ```bash
37
+ python run_tests.py -q # Quick overview
38
+ python run_tests.py --regressions # Check for new failures vs baseline
39
+ ```
40
+
41
+ 4. **Quick iteration**: Test a snippet without running the full suite (the full suite itself runs in ~1s)
42
+ ```bash
43
+ python -c 'from justhtml import JustHTML, to_test_format; print(to_test_format(JustHTML("<html>").root))'
44
+ ```
45
+
46
+ 5. **Benchmark performance**: After changes, verify speed impact
47
+ ```bash
48
+ python benchmarks/performance.py --iterations 1 --parser justhtml --no-mem
49
+ ```
50
+
51
+ 6. **Profile hotspots**: For performance optimization
52
+ ```bash
53
+ python benchmarks/profile.py # Profiles on web100k dataset
54
+ ```
55
+
56
+ ### Test Runner Flags
57
+ - `--test-specs FILE[:INDICES]`: Run specific test(s), e.g., `test2.test:5,10` or `tests1.dat`
58
+ - `-v, -vv, -vvv`: Verbosity (diffs, debug output, full debug)
59
+ - `-q, --quiet`: Summary only
60
+ - `-x, --fail-fast`: Stop on first failure
61
+ - `--regressions`: Compare against HEAD baseline
62
+ - `--exclude-files`, `--exclude-errors`, `--exclude-html`: Skip tests matching patterns
63
+ - `--filter-errors`, `--filter-html`: Only run tests matching patterns
64
+
65
+ ### Benchmark Flags (benchmarks/performance.py)
66
+ - `--iterations 1`: Single run (default: 5 for averaging)
67
+ - `--parser justhtml`: Benchmark only JustHTML (default: all parsers)
68
+ - `--no-mem`: Disable memory profiling (faster)
69
+ - `--limit N`: Test on N files (default: 100)
70
+
71
+ ### Logging & Comments
72
+ - Comments explain **why** (spec rationale), not **what** (code is self-documenting)
73
+ - Cite spec sections when relevant (e.g., "Per §13.2.5.72")
74
+ - No historical notes ("Previously", "Fixed", "Changed") - prefer removing old code
75
+ - Debug calls: `self.debug()` / `parser.debug()` - no gating needed
76
+
77
+ ### Performance Mindset
78
+ - Tokenizer is hot path: minimize allocations, avoid string slicing
79
+ - Use `str.find()` for scanning, not regex when possible
80
+ - Reuse buffers: `text_buffer`, `current_tag_name`, etc.
81
+ - Infer state from structure (stacks, tree) instead of storing flags
@@ -0,0 +1,105 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "3.15-dev", "pypy3.11"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+ cache: "pip"
24
+
25
+ - name: Install dependencies
26
+ run: |
27
+ python -m pip install --upgrade pip
28
+ pip install -e ".[dev]"
29
+
30
+ - name: Checkout html5lib-tests
31
+ run: |
32
+ cd ..
33
+ git clone https://github.com/html5lib/html5lib-tests.git
34
+
35
+ - name: Setup test symlinks
36
+ run: |
37
+ cd tests
38
+ ln -s ../../html5lib-tests/tokenizer html5lib-tests-tokenizer
39
+ ln -s ../../html5lib-tests/tree-construction html5lib-tests-tree
40
+ ln -s ../../html5lib-tests/serializer html5lib-tests-serializer
41
+ ln -s ../../html5lib-tests/encoding html5lib-tests-encoding
42
+
43
+ - name: Run pre-commit
44
+ uses: pre-commit/action@v3.0.1
45
+ env:
46
+ SKIP: mypy
47
+
48
+ - name: Run tests
49
+ run: python run_tests.py
50
+
51
+ - name: Run mypy
52
+ if: matrix.python-version == '3.12'
53
+ run: mypy
54
+
55
+ test-pyodide:
56
+ runs-on: ubuntu-latest
57
+ steps:
58
+ - uses: actions/checkout@v4
59
+
60
+ - name: Set up Node.js
61
+ uses: actions/setup-node@v4
62
+ with:
63
+ node-version: "20"
64
+
65
+ - name: Set up Python
66
+ uses: actions/setup-python@v5
67
+ with:
68
+ python-version: "3.12"
69
+
70
+ - name: Build wheel
71
+ run: |
72
+ pip install build
73
+ python -m build --wheel
74
+
75
+ - name: Test in Pyodide
76
+ run: |
77
+ npm install pyodide
78
+ cat > test_pyodide.js << 'SCRIPT'
79
+ const { loadPyodide } = require("pyodide");
80
+ const fs = require("fs");
81
+ const path = require("path");
82
+
83
+ async function main() {
84
+ const pyodide = await loadPyodide();
85
+ await pyodide.loadPackage("micropip");
86
+ const micropip = pyodide.pyimport("micropip");
87
+
88
+ // Find the wheel and install via file URL
89
+ const wheel = fs.readdirSync("dist").find(f => f.endsWith(".whl"));
90
+ const wheelPath = path.resolve("dist", wheel);
91
+ await micropip.install("file://" + wheelPath);
92
+
93
+ // Test basic parsing
94
+ const result = pyodide.runPython(
95
+ 'from justhtml import JustHTML; doc = JustHTML("<html><body><p>Hello</p></body></html>"); doc.root.children[0].name'
96
+ );
97
+ if (result !== "html") {
98
+ throw new Error("Expected 'html', got '" + result + "'");
99
+ }
100
+ console.log("Pyodide test passed!");
101
+ }
102
+
103
+ main().catch(e => { console.error(e); process.exit(1); });
104
+ SCRIPT
105
+ node test_pyodide.js
@@ -0,0 +1,34 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ pypi-publish:
10
+ name: Upload release to PyPI
11
+ runs-on: ubuntu-latest
12
+ environment:
13
+ name: pypi
14
+ url: https://pypi.org/p/justhtml
15
+ permissions:
16
+ id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
17
+ contents: read
18
+ steps:
19
+ - name: Checkout
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Set up Python
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: "3.x"
26
+
27
+ - name: Install build dependencies
28
+ run: python -m pip install build
29
+
30
+ - name: Build package
31
+ run: python -m build
32
+
33
+ - name: Publish package distributions to PyPI
34
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,174 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ coverage.json
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # UV
99
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ #uv.lock
103
+
104
+ # poetry
105
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
107
+ # commonly ignored for libraries.
108
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109
+ #poetry.lock
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ #pdm.lock
114
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
115
+ # in version control.
116
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
117
+ .pdm.toml
118
+ .pdm-python
119
+ .pdm-build/
120
+
121
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122
+ __pypackages__/
123
+
124
+ # Celery stuff
125
+ celerybeat-schedule
126
+ celerybeat.pid
127
+
128
+ # SageMath parsed files
129
+ *.sage.py
130
+
131
+ # Environments
132
+ .env
133
+ .venv
134
+ env/
135
+ venv/
136
+ ENV/
137
+ env.bak/
138
+ venv.bak/
139
+
140
+ # Spyder project settings
141
+ .spyderproject
142
+ .spyproject
143
+
144
+ # Rope project settings
145
+ .ropeproject
146
+
147
+ # mkdocs documentation
148
+ /site
149
+
150
+ # mypy
151
+ .mypy_cache/
152
+ .dmypy.json
153
+ dmypy.json
154
+
155
+ # Pyre type checker
156
+ .pyre/
157
+
158
+ # pytype static type analyzer
159
+ .pytype/
160
+
161
+ # Cython debug symbols
162
+ cython_debug/
163
+
164
+ # PyCharm
165
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
166
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
167
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
168
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
169
+ #.idea/
170
+
171
+ # PyPI configuration file
172
+ .pypirc
173
+ .python-version
174
+ tests/html5lib-tests-*
@@ -0,0 +1,33 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v6.0.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ exclude: '\.dat$'
7
+ - id: end-of-file-fixer
8
+ - id: check-yaml
9
+ - id: check-added-large-files
10
+ - id: check-toml
11
+
12
+ - repo: https://github.com/astral-sh/ruff-pre-commit
13
+ rev: v0.14.7
14
+ hooks:
15
+ - id: ruff-check
16
+ args: [--fix]
17
+ - id: ruff-format
18
+
19
+ - repo: https://github.com/pre-commit/mirrors-mypy
20
+ rev: v1.19.1
21
+ hooks:
22
+ - id: mypy
23
+ args: [--config-file=pyproject.toml]
24
+ files: ^src/justhtml/
25
+
26
+ - repo: local
27
+ hooks:
28
+ - id: tests-coverage
29
+ name: Tests & Coverage
30
+ entry: bash -c 'coverage run run_tests.py && coverage report'
31
+ language: system
32
+ pass_filenames: false
33
+ always_run: true
@@ -0,0 +1,128 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, religion, or sexual identity
10
+ and orientation.
11
+
12
+ We pledge to act and interact in ways that contribute to an open, welcoming,
13
+ diverse, inclusive, and healthy community.
14
+
15
+ ## Our Standards
16
+
17
+ Examples of behavior that contributes to a positive environment for our
18
+ community include:
19
+
20
+ * Demonstrating empathy and kindness toward other people
21
+ * Being respectful of differing opinions, viewpoints, and experiences
22
+ * Giving and gracefully accepting constructive feedback
23
+ * Accepting responsibility and apologizing to those affected by our mistakes,
24
+ and learning from the experience
25
+ * Focusing on what is best not just for us as individuals, but for the
26
+ overall community
27
+
28
+ Examples of unacceptable behavior include:
29
+
30
+ * The use of sexualized language or imagery, and sexual attention or
31
+ advances of any kind
32
+ * Trolling, insulting or derogatory comments, and personal or political attacks
33
+ * Public or private harassment
34
+ * Publishing others' private information, such as a physical or email
35
+ address, without their explicit permission
36
+ * Other conduct which could reasonably be considered inappropriate in a
37
+ professional setting
38
+
39
+ ## Enforcement Responsibilities
40
+
41
+ Community leaders are responsible for clarifying and enforcing our standards of
42
+ acceptable behavior and will take appropriate and fair corrective action in
43
+ response to any behavior that they deem inappropriate, threatening, offensive,
44
+ or harmful.
45
+
46
+ Community leaders have the right and responsibility to remove, edit, or reject
47
+ comments, commits, code, wiki edits, issues, and other contributions that are
48
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
49
+ decisions when appropriate.
50
+
51
+ ## Scope
52
+
53
+ This Code of Conduct applies within all community spaces, and also applies when
54
+ an individual is officially representing the community in public spaces.
55
+ Examples of representing our community include using an official e-mail address,
56
+ posting via an official social media account, or acting as an appointed
57
+ representative at an online or offline event.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported to the community leaders responsible for enforcement at
63
+ emil@emilstenstrom.se.
64
+ All complaints will be reviewed and investigated promptly and fairly.
65
+
66
+ All community leaders are obligated to respect the privacy and security of the
67
+ reporter of any incident.
68
+
69
+ ## Enforcement Guidelines
70
+
71
+ Community leaders will follow these Community Impact Guidelines in determining
72
+ the consequences for any action they deem in violation of this Code of Conduct:
73
+
74
+ ### 1. Correction
75
+
76
+ **Community Impact**: Use of inappropriate language or other behavior deemed
77
+ unprofessional or unwelcome in the community.
78
+
79
+ **Consequence**: A private, written warning from community leaders, providing
80
+ clarity around the nature of the violation and an explanation of why the
81
+ behavior was inappropriate. A public apology may be requested.
82
+
83
+ ### 2. Warning
84
+
85
+ **Community Impact**: A violation through a single incident or series
86
+ of actions.
87
+
88
+ **Consequence**: A warning with consequences for continued behavior. No
89
+ interaction with the people involved, including unsolicited interaction with
90
+ those enforcing the Code of Conduct, for a specified period of time. This
91
+ includes avoiding interactions in community spaces as well as external channels
92
+ like social media. Violating these terms may lead to a temporary or
93
+ permanent ban.
94
+
95
+ ### 3. Temporary Ban
96
+
97
+ **Community Impact**: A serious violation of community standards, including
98
+ sustained inappropriate behavior.
99
+
100
+ **Consequence**: A temporary ban from any sort of interaction or public
101
+ communication with the community for a specified period of time. No public or
102
+ private interaction with the people involved, including unsolicited interaction
103
+ with those enforcing the Code of Conduct, is allowed during this period.
104
+ Violating these terms may lead to a permanent ban.
105
+
106
+ ### 4. Permanent Ban
107
+
108
+ **Community Impact**: Demonstrating a pattern of violation of community
109
+ standards, including sustained inappropriate behavior, harassment of an
110
+ individual, or aggression toward or disparagement of classes of individuals.
111
+
112
+ **Consequence**: A permanent ban from any sort of public interaction within
113
+ the community.
114
+
115
+ ## Attribution
116
+
117
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118
+ version 2.0, available at
119
+ https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120
+
121
+ Community Impact Guidelines were inspired by [Mozilla's code of conduct
122
+ enforcement ladder](https://github.com/mozilla/diversity).
123
+
124
+ [homepage]: https://www.contributor-covenant.org
125
+
126
+ For answers to common questions about this code of conduct, see the FAQ at
127
+ https://www.contributor-covenant.org/faq. Translations are available at
128
+ https://www.contributor-covenant.org/translations.
@@ -0,0 +1,109 @@
1
+ # Contributing to JustHTML
2
+
3
+ Thanks for considering contributing to JustHTML! This document explains how to set up your development environment and the standards we follow.
4
+
5
+ ## Development Setup
6
+
7
+ 1. Clone the repository:
8
+ ```bash
9
+ git clone https://github.com/emilstenstrom/justhtml.git
10
+ cd justhtml
11
+ ```
12
+
13
+ 2. Create a virtual environment and install dev dependencies:
14
+ ```bash
15
+ python -m venv .venv
16
+ source .venv/bin/activate
17
+ pip install -e ".[dev]"
18
+ ```
19
+
20
+ 3. Install pre-commit hooks:
21
+ ```bash
22
+ pre-commit install
23
+ ```
24
+
25
+ ## Running Tests
26
+
27
+ The test suite uses the html5lib test cases plus additional tests for selector functionality.
28
+
29
+ If you want to run the full html5lib test suite locally, clone `html5lib-tests` next to this repository and create the symlinks described in [tests/README.md](tests/README.md) (tokenizer, tree-construction, serializer, and encoding).
30
+
31
+ ```bash
32
+ # Run all tests
33
+ python run_tests.py
34
+
35
+ # Run with coverage report
36
+ coverage run run_tests.py && coverage report
37
+
38
+ # Run specific tests from one file (here: tests 5 and 10 of test2.test)
39
+ python run_tests.py --test-specs test2.test:5,10 -v
40
+
41
+ # Quick iteration - test a snippet
42
+ python -c 'from justhtml import JustHTML, to_test_format; print(to_test_format(JustHTML("<html>").root))'
43
+ ```
44
+
45
+ **Coverage is required to be 100%.** All new code must be fully tested.
46
+
47
+ ## Pre-commit Hooks
48
+
49
+ Pre-commit runs automatically on every commit. Its checks include:
50
+
51
+ - **Trailing whitespace** and **end-of-file** formatting
52
+ - **YAML** and **TOML** validity
53
+ - **Ruff check** - linting with auto-fix
54
+ - **Ruff format** - code formatting
55
+ - **Tests & Coverage** - full test suite with 100% coverage requirement
56
+
57
+ Run manually:
58
+ ```bash
59
+ pre-commit run --all-files
60
+ ```
61
+
62
+ ## Code Style
63
+
64
+ We use [Ruff](https://docs.astral.sh/ruff/) for linting and formatting:
65
+
66
+ - **Line length**: 119 characters
67
+ - **Target**: Python 3.10+
68
+ - **Rules**: Nearly all Ruff rules enabled (see `pyproject.toml` for exceptions)
69
+
70
+ Key style points:
71
+ - Use plain `assert` for tests, not `self.assertEqual` etc.
72
+ - Comments explain **why**, not **what**
73
+ - No typing annotations
74
+ - Cite spec sections when relevant (e.g., "Per §13.2.5.72")
75
+
76
+ ## Benchmarking
77
+
78
+ After making changes, verify performance impact:
79
+
80
+ ```bash
81
+ # Quick benchmark
82
+ python benchmarks/performance.py --iterations 1 --parser justhtml --no-mem
83
+
84
+ # Profile hotspots
85
+ python benchmarks/profile.py
86
+ ```
87
+
88
+ ## Architecture Notes
89
+
90
+ - **Tokenizer** (`tokenizer.py`): HTML5 spec state machine
91
+ - **Tree builder** (`treebuilder.py`): Constructs DOM tree following HTML5 rules
92
+ - **Node tree** (`node.py`): DOM-like structure, use `append_child()` / `insert_before()`
93
+ - **Selector** (`selector.py`): CSS selector matching
94
+
95
+ Golden rules:
96
+ 1. Follow WHATWG HTML5 spec exactly
97
+ 2. No exceptions in hot paths
98
+ 3. Minimal allocations in tokenizer
99
+ 4. No `hasattr`/`getattr`/`delattr` - all structures are deterministic
100
+
101
+ ## Submitting Changes
102
+
103
+ 1. Fork the repository
104
+ 2. Create a feature branch
105
+ 3. Make your changes with tests
106
+ 4. Ensure pre-commit passes
107
+ 5. Submit a pull request
108
+
109
+ Questions? Open an issue on GitHub.
@@ -0,0 +1,24 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Emil Stenström (JustHTML)
4
+ Copyright (c) 2014-2017, The html5ever Project Developers (html5ever inspiration)
5
+ Copyright (c) 2006-2013 James Graham, Geoffrey Sneddon, and
6
+ other contributors (html5lib-tests)
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ of this software and associated documentation files (the "Software"), to deal
10
+ in the Software without restriction, including without limitation the rights
11
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ copies of the Software, and to permit persons to whom the Software is
13
+ furnished to do so, subject to the following conditions:
14
+
15
+ The above copyright notice and this permission notice shall be included in all
16
+ copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
+ SOFTWARE.