play-parser 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- play_parser-1.0.0/CHANGELOG.md +16 -0
- play_parser-1.0.0/LICENSE +21 -0
- play_parser-1.0.0/MANIFEST.in +5 -0
- play_parser-1.0.0/PKG-INFO +195 -0
- play_parser-1.0.0/README.md +162 -0
- play_parser-1.0.0/RELEASE.md +62 -0
- play_parser-1.0.0/SECURITY.md +11 -0
- play_parser-1.0.0/data/README.md +7 -0
- play_parser-1.0.0/data/json/optimal/A Midsummer Night's Dream.json +5008 -0
- play_parser-1.0.0/data/json/optimal/All's Well That Ends Well.json +8665 -0
- play_parser-1.0.0/data/json/optimal/Antony and Cleopatra.json +11638 -0
- play_parser-1.0.0/data/json/optimal/As You Like It.json +7585 -0
- play_parser-1.0.0/data/json/optimal/Coriolanus.json +10572 -0
- play_parser-1.0.0/data/json/optimal/Cymbeline.json +8624 -0
- play_parser-1.0.0/data/json/optimal/Hamlet.json +11084 -0
- play_parser-1.0.0/data/json/optimal/Henry IV, Part 1.json +7418 -0
- play_parser-1.0.0/data/json/optimal/Henry IV, Part 2.json +8115 -0
- play_parser-1.0.0/data/json/optimal/Henry V.json +6956 -0
- play_parser-1.0.0/data/json/optimal/Henry VI, Part 1.json +6730 -0
- play_parser-1.0.0/data/json/optimal/Henry VI, Part 2.json +7961 -0
- play_parser-1.0.0/data/json/optimal/Henry VI, Part 3.json +7962 -0
- play_parser-1.0.0/data/json/optimal/Henry VIII.json +6823 -0
- play_parser-1.0.0/data/json/optimal/Julius Caesar.json +7722 -0
- play_parser-1.0.0/data/json/optimal/King John.json +5252 -0
- play_parser-1.0.0/data/json/optimal/King Lear.json +10634 -0
- play_parser-1.0.0/data/json/optimal/Love's Labour's Lost.json +9496 -0
- play_parser-1.0.0/data/json/optimal/Macbeth.json +6730 -0
- play_parser-1.0.0/data/json/optimal/Measure for Measure.json +8557 -0
- play_parser-1.0.0/data/json/optimal/Much Ado About Nothing.json +8691 -0
- play_parser-1.0.0/data/json/optimal/Othello.json +11014 -0
- play_parser-1.0.0/data/json/optimal/Pericles, Prince of Tyre.json +6400 -0
- play_parser-1.0.0/data/json/optimal/README.md +7 -0
- play_parser-1.0.0/data/json/optimal/Richard II.json +5422 -0
- play_parser-1.0.0/data/json/optimal/Richard III.json +10683 -0
- play_parser-1.0.0/data/json/optimal/Romeo and Juliet.json +8273 -0
- play_parser-1.0.0/data/json/optimal/The Comedy of Errors.json +5568 -0
- play_parser-1.0.0/data/json/optimal/The Importance of Being Earnest.json +6213 -0
- play_parser-1.0.0/data/json/optimal/The Merchant of Venice.json +6115 -0
- play_parser-1.0.0/data/json/optimal/The Merry Wives of Windsor.json +9801 -0
- play_parser-1.0.0/data/json/optimal/The Taming of the Shrew.json +8497 -0
- play_parser-1.0.0/data/json/optimal/The Tempest.json +6235 -0
- play_parser-1.0.0/data/json/optimal/The Winter's Tale.json +7037 -0
- play_parser-1.0.0/data/json/optimal/Timon of Athens.json +7379 -0
- play_parser-1.0.0/data/json/optimal/Titus Andronicus.json +5876 -0
- play_parser-1.0.0/data/json/optimal/Troilus and Cressida.json +7458 -0
- play_parser-1.0.0/data/json/optimal/Twelfth Night.json +8647 -0
- play_parser-1.0.0/data/json/optimal/Two Gentlemen of Verona.json +7751 -0
- play_parser-1.0.0/data/text/raw/A Midsummer Night's Dream.txt +2806 -0
- play_parser-1.0.0/data/text/raw/All's Well That Ends Well.txt +4026 -0
- play_parser-1.0.0/data/text/raw/Antony and Cleopatra.txt +5042 -0
- play_parser-1.0.0/data/text/raw/As You Like It.txt +3639 -0
- play_parser-1.0.0/data/text/raw/Coriolanus.txt +5103 -0
- play_parser-1.0.0/data/text/raw/Cymbeline.txt +4823 -0
- play_parser-1.0.0/data/text/raw/Hamlet.txt +5387 -0
- play_parser-1.0.0/data/text/raw/Henry IV, Part 1.txt +3988 -0
- play_parser-1.0.0/data/text/raw/Henry IV, Part 2.txt +4268 -0
- play_parser-1.0.0/data/text/raw/Henry V.txt +4132 -0
- play_parser-1.0.0/data/text/raw/Henry VI, Part 1.txt +3636 -0
- play_parser-1.0.0/data/text/raw/Henry VI, Part 2.txt +4131 -0
- play_parser-1.0.0/data/text/raw/Henry VI, Part 3.txt +3957 -0
- play_parser-1.0.0/data/text/raw/Henry VIII.txt +4095 -0
- play_parser-1.0.0/data/text/raw/Julius Caesar.txt +3574 -0
- play_parser-1.0.0/data/text/raw/King John.txt +3323 -0
- play_parser-1.0.0/data/text/raw/King Lear.txt +4839 -0
- play_parser-1.0.0/data/text/raw/Love's Labour's Lost.txt +4038 -0
- play_parser-1.0.0/data/text/raw/Macbeth.txt +3242 -0
- play_parser-1.0.0/data/text/raw/Measure for Measure.txt +3896 -0
- play_parser-1.0.0/data/text/raw/Much Ado About Nothing.txt +3689 -0
- play_parser-1.0.0/data/text/raw/Othello.txt +4883 -0
- play_parser-1.0.0/data/text/raw/Pericles, Prince of Tyre.txt +3228 -0
- play_parser-1.0.0/data/text/raw/Richard II.txt +3498 -0
- play_parser-1.0.0/data/text/raw/Richard III.txt +5037 -0
- play_parser-1.0.0/data/text/raw/Romeo and Juliet.txt +4143 -0
- play_parser-1.0.0/data/text/raw/The Comedy of Errors.txt +2666 -0
- play_parser-1.0.0/data/text/raw/The Importance of Being Earnest.txt +3841 -0
- play_parser-1.0.0/data/text/raw/The Merchant of Venice.txt +3445 -0
- play_parser-1.0.0/data/text/raw/The Merry Wives of Windsor.txt +3856 -0
- play_parser-1.0.0/data/text/raw/The Taming of the Shrew.txt +3705 -0
- play_parser-1.0.0/data/text/raw/The Tempest.txt +3049 -0
- play_parser-1.0.0/data/text/raw/The Winter's Tale.txt +4240 -0
- play_parser-1.0.0/data/text/raw/Timon of Athens.txt +3381 -0
- play_parser-1.0.0/data/text/raw/Titus Andronicus.txt +3298 -0
- play_parser-1.0.0/data/text/raw/Troilus and Cressida.txt +3539 -0
- play_parser-1.0.0/data/text/raw/Twelfth Night.txt +3578 -0
- play_parser-1.0.0/data/text/raw/Two Gentlemen of Verona.txt +3221 -0
- play_parser-1.0.0/docs/API.md +207 -0
- play_parser-1.0.0/docs/FORMAT_PROFILES.md +196 -0
- play_parser-1.0.0/docs/JSON_SCHEMA.md +150 -0
- play_parser-1.0.0/docs/play_document.schema.json +241 -0
- play_parser-1.0.0/pyproject.toml +69 -0
- play_parser-1.0.0/setup.cfg +4 -0
- play_parser-1.0.0/src/play_parser/__init__.py +32 -0
- play_parser-1.0.0/src/play_parser/__main__.py +6 -0
- play_parser-1.0.0/src/play_parser/_io.py +51 -0
- play_parser-1.0.0/src/play_parser/cli/__init__.py +3 -0
- play_parser-1.0.0/src/play_parser/cli/main.py +240 -0
- play_parser-1.0.0/src/play_parser/document/__init__.py +21 -0
- play_parser-1.0.0/src/play_parser/document/assembler.py +185 -0
- play_parser-1.0.0/src/play_parser/document/builder.py +170 -0
- play_parser-1.0.0/src/play_parser/document/constants.py +17 -0
- play_parser-1.0.0/src/play_parser/document/text.py +15 -0
- play_parser-1.0.0/src/play_parser/document/types.py +92 -0
- play_parser-1.0.0/src/play_parser/document/validation.py +250 -0
- play_parser-1.0.0/src/play_parser/domain/__init__.py +11 -0
- play_parser-1.0.0/src/play_parser/domain/play.py +743 -0
- play_parser-1.0.0/src/play_parser/ingestion/__init__.py +3 -0
- play_parser-1.0.0/src/play_parser/ingestion/ingestor.py +181 -0
- play_parser-1.0.0/src/play_parser/parsing/__init__.py +18 -0
- play_parser-1.0.0/src/play_parser/parsing/context.py +103 -0
- play_parser-1.0.0/src/play_parser/parsing/front_matter.py +86 -0
- play_parser-1.0.0/src/play_parser/parsing/parser.py +292 -0
- play_parser-1.0.0/src/play_parser/parsing/profiles/__init__.py +15 -0
- play_parser-1.0.0/src/play_parser/parsing/profiles/builtins/__init__.py +1 -0
- play_parser-1.0.0/src/play_parser/parsing/profiles/builtins/colon_inline.json +15 -0
- play_parser-1.0.0/src/play_parser/parsing/profiles/builtins/dot_block.json +16 -0
- play_parser-1.0.0/src/play_parser/parsing/profiles/builtins/dot_inline.json +16 -0
- play_parser-1.0.0/src/play_parser/parsing/profiles/builtins/mixed_parenthetical.json +17 -0
- play_parser-1.0.0/src/play_parser/parsing/profiles/builtins/narrative_stage_heavy.json +17 -0
- play_parser-1.0.0/src/play_parser/parsing/profiles/loader.py +149 -0
- play_parser-1.0.0/src/play_parser/parsing/profiles/schema.py +139 -0
- play_parser-1.0.0/src/play_parser/parsing/speakers.py +130 -0
- play_parser-1.0.0/src/play_parser/parsing/speech.py +604 -0
- play_parser-1.0.0/src/play_parser/parsing/stage.py +178 -0
- play_parser-1.0.0/src/play_parser/parsing/structure.py +87 -0
- play_parser-1.0.0/src/play_parser/py.typed +0 -0
- play_parser-1.0.0/src/play_parser.egg-info/PKG-INFO +195 -0
- play_parser-1.0.0/src/play_parser.egg-info/SOURCES.txt +135 -0
- play_parser-1.0.0/src/play_parser.egg-info/dependency_links.txt +1 -0
- play_parser-1.0.0/src/play_parser.egg-info/entry_points.txt +2 -0
- play_parser-1.0.0/src/play_parser.egg-info/requires.txt +6 -0
- play_parser-1.0.0/src/play_parser.egg-info/top_level.txt +1 -0
- play_parser-1.0.0/tests/test_document_roundtrip.py +83 -0
- play_parser-1.0.0/tests/test_domain.py +90 -0
- play_parser-1.0.0/tests/test_ingestor.py +110 -0
- play_parser-1.0.0/tests/test_parser_core.py +83 -0
- play_parser-1.0.0/tests/test_profiles.py +161 -0
- play_parser-1.0.0/tests/test_public_api.py +32 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 1.0.0
|
|
4
|
+
|
|
5
|
+
Initial public release.
|
|
6
|
+
|
|
7
|
+
### Features
|
|
8
|
+
|
|
9
|
+
- Parse raw `.txt` play files into canonical JSON documents.
|
|
10
|
+
- Assemble canonical JSON documents back into normalised play text.
|
|
11
|
+
- Support explicit format profiles for speaker labels and stage directions.
|
|
12
|
+
- Include built-in profiles for common dramatic text layouts.
|
|
13
|
+
- Validate canonical play documents with a shared validator and JSON Schema.
|
|
14
|
+
- Expose a Python API for ingestion, domain access, validation, assembly, and profiles.
|
|
15
|
+
- Provide a `play-parser` CLI for parsing and assembling single files or folders.
|
|
16
|
+
- Include a corpus-backed test suite covering parsing, profiles, ingestion, domain objects, and roundtrips.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Stergios Poularakis
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: play-parser
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Parse dramatic play text into ordered dramatic events.
|
|
5
|
+
Author: Stergios Poularakis
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/stpoular/play-parser
|
|
8
|
+
Project-URL: Documentation, https://github.com/stpoular/play-parser/tree/main/docs
|
|
9
|
+
Project-URL: Repository, https://github.com/stpoular/play-parser
|
|
10
|
+
Project-URL: Issues, https://github.com/stpoular/play-parser/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/stpoular/play-parser/blob/main/CHANGELOG.md
|
|
12
|
+
Keywords: theatre,drama,plays,parser,json
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Operating System :: OS Independent
|
|
22
|
+
Classifier: Topic :: Text Processing
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: build>=1; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
30
|
+
Requires-Dist: ruff>=0.8; extra == "dev"
|
|
31
|
+
Requires-Dist: twine>=5; extra == "dev"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# play-parser
|
|
35
|
+
|
|
36
|
+
`play-parser` parses theatrical play text into a canonical JSON document and assembles canonical documents back into normalised play text.
|
|
37
|
+
|
|
38
|
+
Canonical text uses a stable output format. For example, speech labels are emitted in colon form such as `Hamlet: ...`, even when the source text used another supported layout.
|
|
39
|
+
|
|
40
|
+
## Features
|
|
41
|
+
|
|
42
|
+
- Parse raw `.txt` play files into structured JSON.
|
|
43
|
+
- Assemble canonical JSON documents into normalised `.txt` output.
|
|
44
|
+
- Read and validate existing canonical `.json` documents.
|
|
45
|
+
- Work with explicit parsing profiles for different source formats.
|
|
46
|
+
- Preserve speeches, stage directions, acts, scenes, metadata, characters, and document statistics.
|
|
47
|
+
- Use the package from Python or through the `play-parser` command line interface.
|
|
48
|
+
|
|
49
|
+
## Supported inputs
|
|
50
|
+
|
|
51
|
+
- Raw `.txt` play files.
|
|
52
|
+
- Canonical `.json` documents produced by this package.
|
|
53
|
+
|
|
54
|
+
The package does not parse PDFs, DOCX files, HTML pages, scans, images, or audio directly. Convert those sources to text first.
|
|
55
|
+
|
|
56
|
+
## Installation
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install play-parser
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Python quick start
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from play_parser import Play, PlayIngestor
|
|
66
|
+
|
|
67
|
+
ingestor = PlayIngestor("Hamlet.txt", profile="colon_inline")
|
|
68
|
+
play = Play(ingestor.data)
|
|
69
|
+
|
|
70
|
+
print(play.title)
|
|
71
|
+
print(play.author)
|
|
72
|
+
print(len(play.acts))
|
|
73
|
+
print(len(play.scenes))
|
|
74
|
+
print(len(play.characters))
|
|
75
|
+
print(len(play.speeches))
|
|
76
|
+
|
|
77
|
+
play.save_json("Hamlet.json")
|
|
78
|
+
play.save_text("Hamlet.canonical.txt")
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Parse text that is already in memory:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from play_parser import PlayIngestor
|
|
85
|
+
|
|
86
|
+
text = "ACT I\n\nSCENE I.\n\nHAMLET: Who's there?"
|
|
87
|
+
ingestor = PlayIngestor.from_text(text, source_name="Hamlet.txt", profile="colon_inline")
|
|
88
|
+
document = ingestor.data
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Assemble a canonical document:
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from play_parser import assemble_play_text
|
|
95
|
+
|
|
96
|
+
canonical_text = assemble_play_text(document)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Command line usage
|
|
100
|
+
|
|
101
|
+
Show help and version information:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
play-parser --help
|
|
105
|
+
play-parser --version
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Parse one file:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
play-parser parse Hamlet.txt \
|
|
112
|
+
--profile colon_inline \
|
|
113
|
+
--json-output Hamlet.json \
|
|
114
|
+
--text-output Hamlet.canonical.txt
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Parse a folder recursively:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
play-parser parse \
|
|
121
|
+
--input-root data/text/raw \
|
|
122
|
+
--recursive \
|
|
123
|
+
--profile colon_inline \
|
|
124
|
+
--json-output-root data/json/generated
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Assemble canonical JSON files into text:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
play-parser assemble \
|
|
131
|
+
--input-root data/json/generated \
|
|
132
|
+
--recursive \
|
|
133
|
+
--output-root data/text/canonical
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Public API
|
|
137
|
+
|
|
138
|
+
Stable top-level imports:
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from play_parser import (
|
|
142
|
+
Play,
|
|
143
|
+
PlayIngestor,
|
|
144
|
+
assemble_play_text,
|
|
145
|
+
get_format_profile,
|
|
146
|
+
list_format_profiles,
|
|
147
|
+
load_format_profile_config,
|
|
148
|
+
load_format_profile_file,
|
|
149
|
+
validate_play_document,
|
|
150
|
+
)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Domain classes such as `Act`, `Scene`, `Speech`, `Character`, `Monologue`, and `Dialogue` are also available from the top-level package.
|
|
154
|
+
|
|
155
|
+
## Format profiles
|
|
156
|
+
|
|
157
|
+
Built-in profiles are available through `list_format_profiles()` and can be passed to `PlayIngestor` or the CLI by name.
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from play_parser import list_format_profiles
|
|
161
|
+
|
|
162
|
+
print(list_format_profiles())
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
See [`docs/FORMAT_PROFILES.md`](docs/FORMAT_PROFILES.md) for the profile schema and examples.
|
|
166
|
+
|
|
167
|
+
## Documentation
|
|
168
|
+
|
|
169
|
+
- [`docs/API.md`](docs/API.md): Python API and CLI profile usage.
|
|
170
|
+
- [`docs/JSON_SCHEMA.md`](docs/JSON_SCHEMA.md): canonical JSON document format.
|
|
171
|
+
- [`docs/FORMAT_PROFILES.md`](docs/FORMAT_PROFILES.md): built-in and custom format profiles.
|
|
172
|
+
|
|
173
|
+
## Development
|
|
174
|
+
|
|
175
|
+
Install development dependencies:
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
python -m pip install -e .[dev]
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Run local checks:
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
python -m ruff check .
|
|
185
|
+
python -m ruff format --check .
|
|
186
|
+
python -m unittest discover -s tests
|
|
187
|
+
python -m build
|
|
188
|
+
python -m twine check dist/*
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Release steps are documented in [`RELEASE.md`](RELEASE.md).
|
|
192
|
+
|
|
193
|
+
## Licence
|
|
194
|
+
|
|
195
|
+
MIT
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# play-parser
|
|
2
|
+
|
|
3
|
+
`play-parser` parses theatrical play text into a canonical JSON document and assembles canonical documents back into normalised play text.
|
|
4
|
+
|
|
5
|
+
Canonical text uses a stable output format. For example, speech labels are emitted in colon form such as `Hamlet: ...`, even when the source text used another supported layout.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- Parse raw `.txt` play files into structured JSON.
|
|
10
|
+
- Assemble canonical JSON documents into normalised `.txt` output.
|
|
11
|
+
- Read and validate existing canonical `.json` documents.
|
|
12
|
+
- Work with explicit parsing profiles for different source formats.
|
|
13
|
+
- Preserve speeches, stage directions, acts, scenes, metadata, characters, and document statistics.
|
|
14
|
+
- Use the package from Python or through the `play-parser` command line interface.
|
|
15
|
+
|
|
16
|
+
## Supported inputs
|
|
17
|
+
|
|
18
|
+
- Raw `.txt` play files.
|
|
19
|
+
- Canonical `.json` documents produced by this package.
|
|
20
|
+
|
|
21
|
+
The package does not parse PDFs, DOCX files, HTML pages, scans, images, or audio directly. Convert those sources to text first.
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install play-parser
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Python quick start
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from play_parser import Play, PlayIngestor
|
|
33
|
+
|
|
34
|
+
ingestor = PlayIngestor("Hamlet.txt", profile="colon_inline")
|
|
35
|
+
play = Play(ingestor.data)
|
|
36
|
+
|
|
37
|
+
print(play.title)
|
|
38
|
+
print(play.author)
|
|
39
|
+
print(len(play.acts))
|
|
40
|
+
print(len(play.scenes))
|
|
41
|
+
print(len(play.characters))
|
|
42
|
+
print(len(play.speeches))
|
|
43
|
+
|
|
44
|
+
play.save_json("Hamlet.json")
|
|
45
|
+
play.save_text("Hamlet.canonical.txt")
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Parse text that is already in memory:
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from play_parser import PlayIngestor
|
|
52
|
+
|
|
53
|
+
text = "ACT I\n\nSCENE I.\n\nHAMLET: Who's there?"
|
|
54
|
+
ingestor = PlayIngestor.from_text(text, source_name="Hamlet.txt", profile="colon_inline")
|
|
55
|
+
document = ingestor.data
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Assemble a canonical document:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from play_parser import assemble_play_text
|
|
62
|
+
|
|
63
|
+
canonical_text = assemble_play_text(document)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Command line usage
|
|
67
|
+
|
|
68
|
+
Show help and version information:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
play-parser --help
|
|
72
|
+
play-parser --version
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Parse one file:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
play-parser parse Hamlet.txt \
|
|
79
|
+
--profile colon_inline \
|
|
80
|
+
--json-output Hamlet.json \
|
|
81
|
+
--text-output Hamlet.canonical.txt
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Parse a folder recursively:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
play-parser parse \
|
|
88
|
+
--input-root data/text/raw \
|
|
89
|
+
--recursive \
|
|
90
|
+
--profile colon_inline \
|
|
91
|
+
--json-output-root data/json/generated
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Assemble canonical JSON files into text:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
play-parser assemble \
|
|
98
|
+
--input-root data/json/generated \
|
|
99
|
+
--recursive \
|
|
100
|
+
--output-root data/text/canonical
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Public API
|
|
104
|
+
|
|
105
|
+
Stable top-level imports:
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from play_parser import (
|
|
109
|
+
Play,
|
|
110
|
+
PlayIngestor,
|
|
111
|
+
assemble_play_text,
|
|
112
|
+
get_format_profile,
|
|
113
|
+
list_format_profiles,
|
|
114
|
+
load_format_profile_config,
|
|
115
|
+
load_format_profile_file,
|
|
116
|
+
validate_play_document,
|
|
117
|
+
)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Domain classes such as `Act`, `Scene`, `Speech`, `Character`, `Monologue`, and `Dialogue` are also available from the top-level package.
|
|
121
|
+
|
|
122
|
+
## Format profiles
|
|
123
|
+
|
|
124
|
+
Built-in profiles are available through `list_format_profiles()` and can be passed to `PlayIngestor` or the CLI by name.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from play_parser import list_format_profiles
|
|
128
|
+
|
|
129
|
+
print(list_format_profiles())
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
See [`docs/FORMAT_PROFILES.md`](docs/FORMAT_PROFILES.md) for the profile schema and examples.
|
|
133
|
+
|
|
134
|
+
## Documentation
|
|
135
|
+
|
|
136
|
+
- [`docs/API.md`](docs/API.md): Python API and CLI profile usage.
|
|
137
|
+
- [`docs/JSON_SCHEMA.md`](docs/JSON_SCHEMA.md): canonical JSON document format.
|
|
138
|
+
- [`docs/FORMAT_PROFILES.md`](docs/FORMAT_PROFILES.md): built-in and custom format profiles.
|
|
139
|
+
|
|
140
|
+
## Development
|
|
141
|
+
|
|
142
|
+
Install development dependencies:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
python -m pip install -e ".[dev]"
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
Run local checks:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
python -m ruff check .
|
|
152
|
+
python -m ruff format --check .
|
|
153
|
+
python -m unittest discover -s tests
|
|
154
|
+
python -m build
|
|
155
|
+
python -m twine check dist/*
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Release steps are documented in [`RELEASE.md`](RELEASE.md).
|
|
159
|
+
|
|
160
|
+
## Licence
|
|
161
|
+
|
|
162
|
+
MIT
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Release checklist
|
|
2
|
+
|
|
3
|
+
Use this checklist when publishing a new public release.
|
|
4
|
+
|
|
5
|
+
## 1. Prepare the repository
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
python -m pip install -e ".[dev]"
|
|
9
|
+
python -m ruff check .
|
|
10
|
+
python -m ruff format --check .
|
|
11
|
+
python -m unittest discover -s tests
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Check that the version has been updated in both `pyproject.toml` and `src/play_parser/__init__.py`. For the first `1.0.0` release, keep `CHANGELOG.md` simple: list only the features of the current release, or leave it without any historical entries.
|
|
15
|
+
|
|
16
|
+
## 2. Build and inspect distributions
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
rm -rf dist build src/*.egg-info
|
|
20
|
+
python -m build
|
|
21
|
+
python -m twine check dist/*
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Optional local wheel smoke test:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
python -m venv /tmp/play-parser-release-test
|
|
28
|
+
/tmp/play-parser-release-test/bin/python -m pip install dist/*.whl
|
|
29
|
+
/tmp/play-parser-release-test/bin/play-parser --version
|
|
30
|
+
/tmp/play-parser-release-test/bin/play-parser --help
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## 3. Publish to TestPyPI first
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
python -m twine upload --repository testpypi dist/*
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Install from TestPyPI in a fresh environment and check the CLI:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
python -m venv /tmp/play-parser-testpypi
|
|
43
|
+
/tmp/play-parser-testpypi/bin/python -m pip install \
|
|
44
|
+
--index-url https://test.pypi.org/simple/ \
|
|
45
|
+
--extra-index-url https://pypi.org/simple/ \
|
|
46
|
+
play-parser
|
|
47
|
+
/tmp/play-parser-testpypi/bin/play-parser --version
|
|
48
|
+
/tmp/play-parser-testpypi/bin/play-parser --help
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## 4. Publish to PyPI
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
python -m twine upload dist/*
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## 5. Tag the release
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git tag v1.0.0
|
|
61
|
+
git push origin v1.0.0
|
|
62
|
+
```
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Security Policy
|
|
2
|
+
|
|
3
|
+
## Supported versions
|
|
4
|
+
|
|
5
|
+
Security fixes are considered for the latest released version of `play-parser`.
|
|
6
|
+
|
|
7
|
+
## Reporting a vulnerability
|
|
8
|
+
|
|
9
|
+
Please report security issues privately by email rather than opening a public issue.
|
|
10
|
+
|
|
11
|
+
Include a clear description, reproduction steps where possible, and the version affected. Please do not disclose the issue publicly until it has been assessed.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# Test corpus data
|
|
2
|
+
|
|
3
|
+
The files in this directory are used as test fixtures and reference corpus material for parser development.
|
|
4
|
+
|
|
5
|
+
The raw play texts are public-domain source texts. The canonical JSON files are generated or curated fixtures used to validate parser behaviour.
|
|
6
|
+
|
|
7
|
+
These files are not required for normal package use.
|