PyPI - play-parser - Versions diffs - 1.0.0__py3-none-any.whl - Mend

play-parser 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

play_parser/__init__.py +32 -0
play_parser/__main__.py +6 -0
play_parser/_io.py +51 -0
play_parser/cli/__init__.py +3 -0
play_parser/cli/main.py +240 -0
play_parser/document/__init__.py +21 -0
play_parser/document/assembler.py +185 -0
play_parser/document/builder.py +170 -0
play_parser/document/constants.py +17 -0
play_parser/document/text.py +15 -0
play_parser/document/types.py +92 -0
play_parser/document/validation.py +250 -0
play_parser/domain/__init__.py +11 -0
play_parser/domain/play.py +743 -0
play_parser/ingestion/__init__.py +3 -0
play_parser/ingestion/ingestor.py +181 -0
play_parser/parsing/__init__.py +18 -0
play_parser/parsing/context.py +103 -0
play_parser/parsing/front_matter.py +86 -0
play_parser/parsing/parser.py +292 -0
play_parser/parsing/profiles/__init__.py +15 -0
play_parser/parsing/profiles/builtins/__init__.py +1 -0
play_parser/parsing/profiles/builtins/colon_inline.json +15 -0
play_parser/parsing/profiles/builtins/dot_block.json +16 -0
play_parser/parsing/profiles/builtins/dot_inline.json +16 -0
play_parser/parsing/profiles/builtins/mixed_parenthetical.json +17 -0
play_parser/parsing/profiles/builtins/narrative_stage_heavy.json +17 -0
play_parser/parsing/profiles/loader.py +149 -0
play_parser/parsing/profiles/schema.py +139 -0
play_parser/parsing/speakers.py +130 -0
play_parser/parsing/speech.py +604 -0
play_parser/parsing/stage.py +178 -0
play_parser/parsing/structure.py +87 -0
play_parser/py.typed +0 -0
play_parser-1.0.0.dist-info/METADATA +195 -0
play_parser-1.0.0.dist-info/RECORD +40 -0
play_parser-1.0.0.dist-info/WHEEL +5 -0
play_parser-1.0.0.dist-info/entry_points.txt +2 -0
play_parser-1.0.0.dist-info/licenses/LICENSE +21 -0
play_parser-1.0.0.dist-info/top_level.txt +1 -0

play_parser/__init__.py ADDED Viewed

@@ -0,0 +1,32 @@
+from play_parser.document.assembler import assemble_play_text
+from play_parser.document.validation import validate_play_document
+from play_parser.domain import Act, Character, Dialogue, Monologue, Play, Scene, Speech
+from play_parser.ingestion import PlayIngestor
+from play_parser.parsing.profiles import (
+    FormatProfile,
+    get_format_profile,
+    list_format_profiles,
+    load_format_profile_config,
+    load_format_profile_file,
+)
+# Package version string.
+__version__ = "1.0.0"
+# Names exported by `from play_parser import *`; each entry is imported above or defined here.
+__all__ = [
+    "__version__",
+    "Act",
+    "Character",
+    "Dialogue",
+    "FormatProfile",
+    "Monologue",
+    "Play",
+    "PlayIngestor",
+    "Scene",
+    "Speech",
+    "assemble_play_text",
+    "get_format_profile",
+    "list_format_profiles",
+    "load_format_profile_config",
+    "load_format_profile_file",
+    "validate_play_document",
+]

play_parser/__main__.py ADDED Viewed

@@ -0,0 +1,6 @@
+from __future__ import annotations
+from play_parser.cli import main
+# When executed as a script/module, exit the process with whatever main() returns.
+if __name__ == "__main__":
+    raise SystemExit(main())

play_parser/_io.py ADDED Viewed

@@ -0,0 +1,51 @@
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Any
+# Default upper bound on accepted input size: 10 MiB.
+DEFAULT_MAX_INPUT_BYTES = 10 * 1024 * 1024
+def check_file_size(path: str | Path, *, max_bytes: int = DEFAULT_MAX_INPUT_BYTES) -> Path:
+    """Return *path* as a ``Path`` after confirming its size does not exceed *max_bytes*.
+    Args:
+        path: Location of the file to inspect.
+        max_bytes: Largest size, in bytes, that is accepted (inclusive).
+    Returns:
+        ``Path(path)``, unchanged apart from the type conversion.
+    Raises:
+        ValueError: If the file is missing, cannot be stat'ed, or is larger than *max_bytes*.
+    """
+    input_path = Path(path)
+    try:
+        size = input_path.stat().st_size
+    except FileNotFoundError as exc:
+        raise ValueError(f"File not found: {input_path}") from exc
+    except PermissionError as exc:
+        raise ValueError(f"Permission denied reading file: {input_path}") from exc
+    except OSError as exc:
+        # Remaining stat() failures (e.g. NotADirectoryError when a parent component is a
+        # regular file) are reported as ValueError too, matching the branches above.
+        raise ValueError(f"Cannot access file: {input_path}: {exc.strerror or exc}") from exc
+    if size > max_bytes:
+        raise ValueError(
+            f"Input file is too large: {input_path} is {size} bytes; maximum supported size is {max_bytes} bytes"
+        )
+    return input_path
+def read_text_file(path: str | Path, *, max_bytes: int = DEFAULT_MAX_INPUT_BYTES) -> str:
+    """Read *path* as UTF-8 text after enforcing the *max_bytes* size limit.
+    Args:
+        path: Location of the file to read.
+        max_bytes: Largest file size, in bytes, that will be read.
+    Returns:
+        The decoded file contents.
+    Raises:
+        ValueError: If the size check fails, the file cannot be read, or it is not valid UTF-8.
+    """
+    input_path = check_file_size(path, max_bytes=max_bytes)
+    try:
+        return input_path.read_text(encoding="utf-8")
+    except PermissionError as exc:
+        raise ValueError(f"Permission denied reading file: {input_path}") from exc
+    except UnicodeDecodeError as exc:
+        raise ValueError(f"File is not valid UTF-8 text: {input_path}") from exc
+    except OSError as exc:
+        # A directory passes the size check but cannot be read, and a file may vanish between
+        # the stat and the read; surface both as ValueError like every other failure here.
+        raise ValueError(f"Cannot read file: {input_path}: {exc.strerror or exc}") from exc
+def read_json_file(path: str | Path, *, max_bytes: int = DEFAULT_MAX_INPUT_BYTES) -> Any:
+    """Load and decode the JSON document stored at *path*.
+    The file is read through ``read_text_file`` (so the *max_bytes* limit applies) and a
+    malformed document is reported as ``ValueError`` naming the file, line and column.
+    """
+    source = Path(path)
+    raw = read_text_file(source, max_bytes=max_bytes)
+    try:
+        decoded = json.loads(raw)
+    except json.JSONDecodeError as err:
+        raise ValueError(
+            f"Invalid JSON in file {source}: {err.msg} at line {err.lineno}, column {err.colno}"
+        ) from err
+    return decoded
+def parse_json_text(text: str, *, label: str = "JSON text") -> Any:
+    """Decode *text* as JSON, raising ``ValueError`` (mentioning *label*) when it is malformed."""
+    try:
+        decoded = json.loads(text)
+    except json.JSONDecodeError as err:
+        raise ValueError(f"Invalid {label}: {err.msg} at line {err.lineno}, column {err.colno}") from err
+    else:
+        return decoded

play_parser/cli/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .main import main
+# Only the CLI entry point is exported from this subpackage.
+__all__ = ["main"]

play_parser/cli/main.py ADDED Viewed

@@ -0,0 +1,240 @@
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+from play_parser import __version__
+from play_parser.document.assembler import assemble_play_text
+from play_parser.ingestion import PlayIngestor
+# Default value of the `parse --indent` option (spaces per JSON nesting level).
+DEFAULT_JSON_INDENT = 2
+def build_cli_parser() -> argparse.ArgumentParser:
+    """Create the top-level parser with ``--version`` and the ``parse``/``assemble`` subcommands."""
+    root = argparse.ArgumentParser(
+        description="Parse raw play text to JSON/canonical text, or assemble canonical text from JSON."
+    )
+    root.add_argument("--version", action="version", version=f"play-parser {__version__}")
+    commands = root.add_subparsers(dest="command", required=True)
+    # Registration order determines the order the subcommands appear in --help.
+    for register in (add_parse_command, add_assemble_command):
+        register(commands)
+    return root
+def add_parse_command(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
+    """Register the ``parse`` subcommand and all of its arguments on *subparsers*."""
+    command = subparsers.add_parser("parse", help="Parse raw .txt files into JSON and/or canonical text.")
+    # (name, add_argument keyword arguments), listed in the order they are shown in --help.
+    argument_specs: tuple[tuple[str, dict[str, object]], ...] = (
+        (
+            "inputs",
+            {
+                "nargs": "*",
+                "type": Path,
+                "help": "Input .txt files or folders. If omitted, all .txt files under --input-root are processed.",
+            },
+        ),
+        (
+            "--input-root",
+            {
+                "type": Path,
+                "default": Path.cwd(),
+                "help": (
+                    "Default folder to read raw .txt files from when no explicit inputs are given. "
+                    "Defaults to the current working directory."
+                ),
+            },
+        ),
+        ("--recursive", {"action": "store_true", "help": "Traverse input directories recursively."}),
+        ("--json-output", {"type": Path, "help": "Write JSON for exactly one input file."}),
+        ("--json-output-root", {"type": Path, "help": "Write one .json file per input into this folder."}),
+        ("--text-output", {"type": Path, "help": "Write canonical assembled .txt for exactly one input file."}),
+        ("--text-output-root", {"type": Path, "help": "Write one canonical .txt file per input into this folder."}),
+        ("--profile", {"help": "Built-in format profile name to use while parsing raw text."}),
+        ("--profile-path", {"type": Path, "help": "Path to a custom format profile JSON file."}),
+        ("--indent", {"type": int, "default": DEFAULT_JSON_INDENT, "help": "JSON indentation level."}),
+    )
+    for name, options in argument_specs:
+        command.add_argument(name, **options)
+def add_assemble_command(subparsers: argparse._SubParsersAction[argparse.ArgumentParser]) -> None:
+    """Register the ``assemble`` subcommand and all of its arguments on *subparsers*."""
+    command = subparsers.add_parser("assemble", help="Assemble canonical .txt from JSON files.")
+    # (name, add_argument keyword arguments), listed in the order they are shown in --help.
+    argument_specs: tuple[tuple[str, dict[str, object]], ...] = (
+        (
+            "inputs",
+            {
+                "nargs": "*",
+                "type": Path,
+                "help": "Input .json files or folders. If omitted, all .json files under --input-root are processed.",
+            },
+        ),
+        (
+            "--input-root",
+            {
+                "type": Path,
+                "default": Path.cwd(),
+                "help": (
+                    "Default folder to read .json files from when no explicit inputs are given. "
+                    "Defaults to the current working directory."
+                ),
+            },
+        ),
+        ("--recursive", {"action": "store_true", "help": "Traverse input directories recursively."}),
+        ("--output", {"type": Path, "help": "Write canonical text for exactly one input file."}),
+        ("--output-root", {"type": Path, "help": "Write one canonical .txt file per input into this folder."}),
+    )
+    for name, options in argument_specs:
+        command.add_argument(name, **options)
+def resolve_input_files(
+    input_paths: list[Path],
+    input_root: Path,
+    suffix: str,
+    *,
+    recursive: bool = False,
+) -> list[Path]:
+    """Expand the requested inputs into a de-duplicated list of resolved file paths.
+    Args:
+        input_paths: Files or folders given on the command line; when empty, *input_root* is scanned.
+        input_root: Folder used both as the default input and as the base for relative candidates.
+        suffix: Lower-case file extension (including the dot) that inputs must carry.
+        recursive: Whether folders are traversed recursively.
+    Returns:
+        Resolved paths in first-seen order; files found in a folder are sorted within that folder.
+    Raises:
+        SystemExit: If a candidate does not exist or an explicit file has the wrong extension.
+    """
+    resolved: list[Path] = []
+    for candidate in input_paths or [input_root]:
+        path = resolve_candidate_path(candidate, input_root)
+        if not path.exists():
+            raise SystemExit(f"Input path not found: {path}")
+        if path.is_dir():
+            walker = path.rglob if recursive else path.glob
+            # Compare the extension case-insensitively, exactly as for explicit files below;
+            # a literal "*.txt" glob would skip "PLAY.TXT" on case-sensitive filesystems.
+            resolved.extend(
+                sorted(
+                    item.resolve()
+                    for item in walker("*")
+                    if item.is_file() and item.name.lower().endswith(suffix)
+                )
+            )
+            continue
+        if path.suffix.lower() != suffix:
+            raise SystemExit(f"Expected {suffix} input, got: {path}")
+        resolved.append(path.resolve())
+    # dict preserves insertion order, so this drops duplicates while keeping first-seen order.
+    return list(dict.fromkeys(resolved))
+def resolve_candidate_path(candidate: Path, input_root: Path) -> Path:
+    """Turn a command-line path into the path that should be inspected.
+    Absolute candidates are returned untouched. A relative candidate that exists relative to the
+    current working directory wins; otherwise it is interpreted relative to *input_root*.
+    """
+    if not candidate.is_absolute():
+        chosen = candidate if candidate.exists() else input_root / candidate
+        return chosen.resolve()
+    return candidate
+def ensure_single_input_output(
+    single_output: Path | None,
+    output_root: Path | None,
+    input_files: list[Path],
+    flag: str,
+) -> None:
+    """Reject invalid uses of a single-file output option named *flag*.
+    Nothing is checked when *single_output* is unset. Otherwise exactly one input file is
+    required and the matching ``--*-root`` option must not be given as well.
+    Raises:
+        SystemExit: With an explanatory message when either rule is violated.
+    """
+    if single_output is None:
+        return
+    if len(input_files) != 1:
+        raise SystemExit(f"{flag} can only be used when processing exactly one input file")
+    if output_root is not None:
+        raise SystemExit(f"Cannot use {flag} together with its corresponding --*-root option")
+def json_output_path_for(input_path: Path, args: argparse.Namespace) -> Path | None:
+    """Return where the JSON for *input_path* should be written, or ``None`` if not requested.
+    ``--json-output`` takes precedence; otherwise the file is named after the input's stem inside
+    ``--json-output-root``.
+    """
+    explicit = args.json_output
+    if explicit is not None:
+        return explicit.resolve()
+    root = args.json_output_root
+    return None if root is None else root.resolve() / f"{input_path.stem}.json"
+def text_output_path_for(input_path: Path, args: argparse.Namespace, *, from_json: bool) -> Path | None:
+    """Return where canonical text for *input_path* should be written, or ``None`` if not requested.
+    With *from_json* true the ``assemble`` options (``output``/``output_root``) are consulted,
+    otherwise the ``parse`` options (``text_output``/``text_output_root``). The single-file option
+    takes precedence over the root folder.
+    """
+    single_attr, root_attr = ("output", "output_root") if from_json else ("text_output", "text_output_root")
+    single = getattr(args, single_attr)
+    if single is not None:
+        return single.resolve()
+    # The root option is only looked up when the single-file option is unset.
+    root = getattr(args, root_attr)
+    if root is not None:
+        return root.resolve() / f"{input_path.stem}.txt"
+    return None
+def write_text(path: Path, text: str) -> None:
+    """Write *text* to *path* as UTF-8, creating any missing parent folders first."""
+    folder = path.parent
+    folder.mkdir(parents=True, exist_ok=True)
+    path.write_text(text, encoding="utf-8")
+def write_json(path: Path, payload: object, *, indent: int) -> None:
+    """Serialize *payload* as JSON to *path* (UTF-8, non-ASCII kept verbatim, trailing newline).
+    Missing parent folders are created before the payload is serialized.
+    """
+    folder = path.parent
+    folder.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(payload, ensure_ascii=False, indent=indent)
+    path.write_text(serialized + "\n", encoding="utf-8")
+def run_parse(args: argparse.Namespace) -> int:
+    """Execute the ``parse`` subcommand and return the process exit status (always 0).
+    Each input is ingested with ``PlayIngestor``; its document is written as JSON and/or as
+    assembled canonical text depending on which output options were supplied.
+    Raises:
+        SystemExit: If no inputs are found or the option combination is invalid.
+    """
+    root = args.input_root.resolve()
+    sources = resolve_input_files(args.inputs, root, ".txt", recursive=args.recursive)
+    if not sources:
+        raise SystemExit(f"No .txt files found under {root}")
+    if not (args.profile is None or args.profile_path is None):
+        raise SystemExit("Cannot use --profile together with --profile-path")
+    output_option_pairs = (
+        (args.json_output, args.json_output_root, "--json-output"),
+        (args.text_output, args.text_output_root, "--text-output"),
+    )
+    for single, folder, flag in output_option_pairs:
+        ensure_single_input_output(single, folder, sources, flag)
+    for source in sources:
+        ingestor = PlayIngestor(source, profile=args.profile, profile_path=args.profile_path)
+        document = ingestor.data
+        json_target = json_output_path_for(source, args)
+        text_target = text_output_path_for(source, args, from_json=False)
+        if json_target is not None:
+            write_json(json_target, document, indent=args.indent)
+        if text_target is not None:
+            write_text(text_target, assemble_play_text(document))
+        report_outputs("Parsed", source, json_output=json_target, text_output=text_target)
+    return 0
+def run_assemble(args: argparse.Namespace) -> int:
+    """Execute the ``assemble`` subcommand and return the process exit status (always 0).
+    Every input JSON file is assembled into canonical text, which is written only when an output
+    location was requested; a one-line report is printed per input either way.
+    Raises:
+        SystemExit: If no inputs are found or ``--output`` is misused.
+    """
+    root = args.input_root.resolve()
+    sources = resolve_input_files(args.inputs, root, ".json", recursive=args.recursive)
+    if not sources:
+        raise SystemExit(f"No .json files found under {root}")
+    ensure_single_input_output(args.output, args.output_root, sources, "--output")
+    for source in sources:
+        target = text_output_path_for(source, args, from_json=True)
+        # Assembly runs even without an output target.
+        canonical_text = assemble_play_text(source)
+        if target is not None:
+            write_text(target, canonical_text)
+        report_outputs("Assembled", source, text_output=target)
+    return 0
+def report_outputs(
+    action: str,
+    input_path: Path,
+    *,
+    json_output: Path | None = None,
+    text_output: Path | None = None,
+) -> None:
+    """Print a one-line summary of what was written for *input_path*.
+    The line starts with *action* and the input path, followed by the ``json=``/``text=`` targets
+    that are set, or by ``(no files written)`` when neither is.
+    """
+    candidates = (("json", json_output), ("text", text_output))
+    written = [f"{kind}={target}" for kind, target in candidates if target is not None]
+    if not written:
+        print(f"{action} {input_path} (no files written)")
+        return
+    print(f"{action} {input_path} -> " + ", ".join(written))
+def main(argv: list[str] | None = None) -> int:
+    """Run the command-line interface and return the process exit status.
+    Args:
+        argv: Argument list to parse instead of the process arguments. ``None`` (the default)
+            lets argparse read ``sys.argv[1:]``, so existing ``main()`` callers are unaffected.
+    Returns:
+        The status returned by the selected subcommand handler.
+    Raises:
+        SystemExit: On argument errors, or if the parsed command has no handler.
+    """
+    args = build_cli_parser().parse_args(argv)
+    if args.command == "parse":
+        return run_parse(args)
+    if args.command == "assemble":
+        return run_assemble(args)
+    # Unreachable while the parser only defines the two commands above; kept as a guard.
+    raise SystemExit(f"Unsupported command: {args.command}")

play_parser/document/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+from .assembler import assemble_play_text
+from .builder import (
+    build_play_document,
+    build_play_document_from_file,
+    build_play_events,
+    build_play_events_from_file,
+    write_json_file,
+)
+from .validation import validate_events, validate_play_document, validate_roundtrip
+# Names exported by `from play_parser.document import *`; all are imported above.
+__all__ = [
+    "assemble_play_text",
+    "build_play_document",
+    "build_play_document_from_file",
+    "build_play_events",
+    "build_play_events_from_file",
+    "write_json_file",
+    "validate_events",
+    "validate_play_document",
+    "validate_roundtrip",
+]

play_parser/document/assembler.py ADDED Viewed

@@ -0,0 +1,185 @@
+from __future__ import annotations
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Any
+from play_parser._io import parse_json_text, read_json_file
+from .types import PlayDocument, PlayEvent, SpeechEvent
+def assemble_play_text(events_or_json: Sequence[PlayEvent] | PlayDocument | str | Path) -> str:
+    """Render play events as canonical plain text.
+    The argument may be a sequence of events, a document mapping with an ``"events"`` entry,
+    a ``Path`` to a JSON file, or a ``str`` holding JSON text (a ``str`` is never read as a path).
+    """
+    normalized = _coerce_events(events_or_json)
+    assembler = PlayTextAssembler()
+    return assembler.assemble(normalized)
+class PlayTextAssembler:
+    """Formats canonical play events back into plain text.
+    The assembler keeps per-document state (queued stage directions and the most recent speech);
+    ``assemble`` resets that state on entry, so one instance can be used for several documents.
+    """
+    def __init__(self) -> None:
+        # Texts of stage directions with attachment == "next", held for the next emitted event.
+        self.pending_attached_stage_directions: list[str] = []
+        # Speaker and event index of the most recent speech; cleared at act/scene starts.
+        self.last_speech_speaker: str | None = None
+        self.last_speech_index = -1
+        # Indices of the act_start/scene_start events in the sequence being assembled.
+        self.scene_boundary_indices: set[int] = set()
+    def _reset_state(self) -> None:
+        """Forget everything remembered from a previous (possibly aborted) ``assemble`` call."""
+        self.pending_attached_stage_directions = []
+        self.last_speech_speaker = None
+        self.last_speech_index = -1
+        self.scene_boundary_indices = set()
+    def assemble(self, events: Sequence[PlayEvent]) -> str:
+        """Return the text for *events*: non-empty blocks separated by blank lines, newline-terminated."""
+        # Without this reset, a reused instance would compare the first speech of this document
+        # against the last speaker of the previous one and could wrongly drop its label.
+        self._reset_state()
+        blocks: list[str] = []
+        self.scene_boundary_indices = {
+            index for index, event in enumerate(events) if event["type"] in {"act_start", "scene_start"}
+        }
+        for index, event in enumerate(events):
+            if event["type"] == "stage_direction" and event.get("attachment") == "next":
+                # Queue it; it is emitted as part of the next formatted event.
+                self.pending_attached_stage_directions.append(event["text"])
+                continue
+            block = self.format_event(event, events=events, index=index)
+            if block:
+                blocks.append(block)
+        if self.pending_attached_stage_directions:
+            # Directions queued after the last event have nothing to attach to; emit them as-is.
+            blocks.extend(self.pending_attached_stage_directions)
+            self.pending_attached_stage_directions = []
+        return "\n\n".join(block for block in blocks if block) + "\n"
+    def format_event(self, event: PlayEvent, *, events: Sequence[PlayEvent], index: int) -> str:
+        """Format one event, consuming queued stage directions and updating the speech tracking."""
+        if event["type"] == "speech":
+            return self.format_speech(event, events=events, index=index)
+        block = self.format_non_speech(event)
+        self.pending_attached_stage_directions = []
+        if event["type"] in {"act_start", "scene_start"}:
+            # A new act or scene always restarts speaker labelling.
+            self.last_speech_speaker = None
+            self.last_speech_index = -1
+        return block
+    def format_speech(self, event: SpeechEvent, *, events: Sequence[PlayEvent], index: int) -> str:
+        """Format a speech event and record it as the most recent speech."""
+        suppress_speaker, bare_speaker_label = self.speaker_label_options(event, events=events, index=index)
+        block = _event_to_block(
+            event,
+            suppress_speaker=suppress_speaker,
+            bare_speaker_label=bare_speaker_label,
+            attached_stage_directions=self.pending_attached_stage_directions,
+        )
+        self.pending_attached_stage_directions = []
+        self.last_speech_speaker = event["speaker"]
+        self.last_speech_index = index
+        return block
+    def speaker_label_options(
+        self,
+        event: SpeechEvent,
+        *,
+        events: Sequence[PlayEvent],
+        index: int,
+    ) -> tuple[bool, bool]:
+        """Return ``(suppress_speaker, bare_speaker_label)`` for *event*.
+        ``labelled is False``: the label is dropped when the previous speech had the same speaker,
+        otherwise a bare upper-case label line is used. ``labelled`` absent: the label is dropped
+        only when ``can_suppress_repeated_speaker`` allows it. Any other value keeps the label.
+        """
+        explicit_labelled = event.get("labelled")
+        if explicit_labelled is False:
+            should_suppress = self.last_speech_speaker == event["speaker"]
+            return should_suppress, not should_suppress
+        if explicit_labelled is None and self.can_suppress_repeated_speaker(event, events=events, index=index):
+            return True, False
+        return False, False
+    def can_suppress_repeated_speaker(
+        self,
+        event: SpeechEvent,
+        *,
+        events: Sequence[PlayEvent],
+        index: int,
+    ) -> bool:
+        """Return whether *event* continues the previous speech of the same speaker.
+        That requires the same speaker as the last speech, no act/scene start immediately before
+        *index*, and only stage directions between the last speech and *index*.
+        """
+        if self.last_speech_speaker != event["speaker"]:
+            return False
+        if index <= 0 or index - 1 in self.scene_boundary_indices or self.last_speech_index < 0:
+            return False
+        return all(
+            other_event["type"] == "stage_direction" for other_event in events[self.last_speech_index + 1 : index]
+        )
+    def format_non_speech(self, event: PlayEvent) -> str:
+        """Format an act, scene, stage-direction or meta event together with any queued directions."""
+        return _event_to_block(
+            event,
+            attached_stage_directions=self.pending_attached_stage_directions,
+        )
+def _coerce_events(events_or_json: Sequence[PlayEvent] | PlayDocument | str | Path) -> list[PlayEvent]:
+    """Normalize every accepted input form into a list of events.
+    A ``Path`` is read as a JSON file, a ``str`` is parsed as JSON text, and anything else is
+    passed through as already-decoded data.
+    """
+    if isinstance(events_or_json, Path):
+        payload = read_json_file(events_or_json)
+    elif isinstance(events_or_json, str):
+        payload = parse_json_text(events_or_json)
+    else:
+        payload = events_or_json
+    return _extract_events(payload)
+def _extract_events(payload: Sequence[PlayEvent] | PlayDocument | Any) -> list[PlayEvent]:
+    """Return the event list held by *payload*.
+    Args:
+        payload: Either a mapping with an ``"events"`` entry or an iterable of events.
+    Returns:
+        A new list containing the events.
+    Raises:
+        ValueError: If *payload* is a mapping without an ``"events"`` entry.
+    """
+    if isinstance(payload, dict):
+        if "events" not in payload:
+            # list(dict) would yield the keys, which later fails with an opaque TypeError when
+            # the assembler indexes a str as if it were an event.
+            raise ValueError('Play document is missing the "events" key')
+        return list(payload["events"])
+    return list(payload)
+def _event_to_block(
+    event: PlayEvent,
+    suppress_speaker: bool = False,
+    bare_speaker_label: bool = False,
+    attached_stage_directions: list[str] | None = None,
+) -> str:
+    """Render a single event as one text block.
+    Non-speech events render as their ``label`` (act/scene start) or ``text`` (stage direction,
+    meta), preceded by the attached stage directions, one per line. Speech events render as
+    ``Speaker: first line`` plus the remaining lines; attached directions are concatenated and
+    placed before the first line. ``suppress_speaker`` omits the ``Speaker:`` prefix, and
+    ``bare_speaker_label`` (checked first) puts the upper-cased speaker on its own line.
+    Unknown event types render as an empty string.
+    """
+    directions = attached_stage_directions or []
+    kind = event["type"]
+    if kind != "speech":
+        # Which field carries the printable content of each non-speech event type.
+        content_field = {
+            "scene_start": "label",
+            "act_start": "label",
+            "stage_direction": "text",
+            "meta": "text",
+        }.get(kind)
+        if content_field is None:
+            return ""
+        content = event[content_field]
+        return "\n".join([*directions, content]) if directions else content
+    text = event["text"]
+    # splitlines() on a non-empty string always yields at least one element.
+    lines = text.splitlines() if text else [""]
+    prefix = "".join(directions)
+    if bare_speaker_label:
+        parts = [event["speaker"].upper()]
+        if prefix:
+            parts.append(prefix)
+        if text:
+            parts.extend(lines)
+        return "\n".join(parts)
+    first_line, *remainder = lines
+    if suppress_speaker:
+        head = f"{prefix} {first_line}".strip() if prefix else first_line
+    elif prefix:
+        head = f"{event['speaker']}: {prefix} {first_line}".rstrip()
+    else:
+        head = f"{event['speaker']}: {first_line}".rstrip()
+    return "\n".join([head, *remainder])