PyPI - play-parser - Versions diffs - 1.0.0__py3-none-any.whl - Mend

play-parser 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

play_parser/__init__.py +32 -0
play_parser/__main__.py +6 -0
play_parser/_io.py +51 -0
play_parser/cli/__init__.py +3 -0
play_parser/cli/main.py +240 -0
play_parser/document/__init__.py +21 -0
play_parser/document/assembler.py +185 -0
play_parser/document/builder.py +170 -0
play_parser/document/constants.py +17 -0
play_parser/document/text.py +15 -0
play_parser/document/types.py +92 -0
play_parser/document/validation.py +250 -0
play_parser/domain/__init__.py +11 -0
play_parser/domain/play.py +743 -0
play_parser/ingestion/__init__.py +3 -0
play_parser/ingestion/ingestor.py +181 -0
play_parser/parsing/__init__.py +18 -0
play_parser/parsing/context.py +103 -0
play_parser/parsing/front_matter.py +86 -0
play_parser/parsing/parser.py +292 -0
play_parser/parsing/profiles/__init__.py +15 -0
play_parser/parsing/profiles/builtins/__init__.py +1 -0
play_parser/parsing/profiles/builtins/colon_inline.json +15 -0
play_parser/parsing/profiles/builtins/dot_block.json +16 -0
play_parser/parsing/profiles/builtins/dot_inline.json +16 -0
play_parser/parsing/profiles/builtins/mixed_parenthetical.json +17 -0
play_parser/parsing/profiles/builtins/narrative_stage_heavy.json +17 -0
play_parser/parsing/profiles/loader.py +149 -0
play_parser/parsing/profiles/schema.py +139 -0
play_parser/parsing/speakers.py +130 -0
play_parser/parsing/speech.py +604 -0
play_parser/parsing/stage.py +178 -0
play_parser/parsing/structure.py +87 -0
play_parser/py.typed +0 -0
play_parser-1.0.0.dist-info/METADATA +195 -0
play_parser-1.0.0.dist-info/RECORD +40 -0
play_parser-1.0.0.dist-info/WHEEL +5 -0
play_parser-1.0.0.dist-info/entry_points.txt +2 -0
play_parser-1.0.0.dist-info/licenses/LICENSE +21 -0
play_parser-1.0.0.dist-info/top_level.txt +1 -0

play_parser/document/builder.py ADDED Viewed

@@ -0,0 +1,170 @@
+from __future__ import annotations
+import json
+import re
+from collections.abc import Sequence
+from pathlib import Path
+from play_parser._io import read_text_file
+from play_parser.document.text import count_words
+from play_parser.parsing.parser import parse_play_text
+from play_parser.parsing.profiles import FormatProfile
+from .types import PlayDocument, PlayEvent
+from .validation import validate_events
+# Matches any run of whitespace; normalize_inline_match_text replaces each run with one space.
+WHITESPACE_RE = re.compile(r"\s+")
+# Default ``indent`` that write_json_file passes to json.dumps.
+DEFAULT_JSON_INDENT = 2
+# Lower-case byline prefix that extract_author strips from author metadata text.
+AUTHOR_BYLINE_PREFIX = "by "
+def build_play_events(text: str, *, profile: FormatProfile | None = None) -> list[PlayEvent]:
+    """Parse ``text`` into play events and validate them.
+
+    Args:
+        text: Raw play text handed to ``parse_play_text``.
+        profile: Optional format profile forwarded unchanged to the parser.
+
+    Returns:
+        The events returned by ``parse_play_text``.
+
+    Raises:
+        ValueError: If ``validate_events`` rejects any parsed event.
+    """
+    parsed = parse_play_text(text, profile=profile)
+    validate_events(parsed)
+    return parsed
+def build_play_document(text: str, *, play_name: str | None = None, profile: FormatProfile | None = None) -> PlayDocument:
+    """Build the document (metadata plus events) for ``text``.
+
+    Args:
+        text: Raw play text; parsed for events and also used for the metadata word count.
+        play_name: Title to fall back on when no title metadata event is found.
+        profile: Optional format profile forwarded to ``build_play_events``.
+
+    Returns:
+        A mapping with the keys ``"metadata"`` and ``"events"``.
+    """
+    parsed_events = build_play_events(text, profile=profile)
+    metadata = build_play_metadata(parsed_events, text, play_name=play_name)
+    return {"metadata": metadata, "events": parsed_events}
+def build_play_events_from_file(path: str | Path, *, profile: FormatProfile | None = None) -> list[PlayEvent]:
+    """Read the file at ``path`` with ``read_text_file`` and return its validated play events."""
+    source_text = read_text_file(Path(path))
+    return build_play_events(source_text, profile=profile)
+def build_play_document_from_file(path: str | Path, *, profile: FormatProfile | None = None) -> PlayDocument:
+    """Read the file at ``path`` and build its play document.
+
+    The file stem (name without the final suffix) is passed as ``play_name``.
+    """
+    source = Path(path)
+    text = read_text_file(source)
+    return build_play_document(text, play_name=source.stem, profile=profile)
+def write_json_file(
+    input_path: str | Path,
+    output_path: str | Path,
+    *,
+    indent: int = DEFAULT_JSON_INDENT,
+    profile: FormatProfile | None = None,
+) -> PlayDocument:
+    """Parse ``input_path`` and write the resulting document to ``output_path`` as JSON.
+
+    Missing parent directories of ``output_path`` are created. The file is written
+    as UTF-8 with non-ASCII characters left unescaped and a trailing newline.
+
+    Returns:
+        The document that was written.
+    """
+    document = build_play_document_from_file(input_path, profile=profile)
+    destination = Path(output_path)
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(document, ensure_ascii=False, indent=indent)
+    destination.write_text(serialized + "\n", encoding="utf-8")
+    return document
+def build_play_metadata(
+    events: Sequence[PlayEvent],
+    raw_text: str,
+    *,
+    play_name: str | None = None,
+) -> dict[str, object]:
+    """Summarise ``events`` into the document metadata mapping.
+
+    Args:
+        events: Play events to summarise.
+        raw_text: Source text; only its word count is used.
+        play_name: Title fallback handed to ``extract_title``.
+
+    Returns:
+        A mapping with ``title``, ``author``, ``characters``, ``acts``,
+        ``scenes`` and a ``stats`` mapping of integer counters.
+    """
+    title = extract_title(events, play_name=play_name)
+    author = extract_author(events)
+    acts, scenes = collect_structure(events)
+    characters = collect_characters(events)
+    direction_total = sum(1 for event in events if event["type"] == "stage_direction")
+    speech_texts = [event["text"] for event in events if event["type"] == "speech"]
+    stats = {
+        "stage_directions": direction_total,
+        "speeches": len(speech_texts),
+        "scenes": len(scenes),
+        "acts": len(acts),
+        "source_words": count_words(raw_text),
+        "spoken_words": sum(map(count_words, speech_texts)),
+    }
+    return {
+        "title": title,
+        "author": author,
+        "characters": characters,
+        "acts": acts,
+        "scenes": scenes,
+        "stats": stats,
+    }
+def extract_title(events: Sequence[PlayEvent], *, play_name: str | None) -> str | None:
+    """Return the text of the first ``meta``/``title`` event, or ``play_name`` if there is none."""
+    title_texts = (
+        event["text"]
+        for event in events
+        if event["type"] == "meta" and event["subtype"] == "title"
+    )
+    return next(title_texts, play_name)
+def extract_author(events: Sequence[PlayEvent]) -> str | None:
+    """Return the author named by the first ``meta``/``author`` event.
+
+    The text is stripped, and a leading ``AUTHOR_BYLINE_PREFIX`` (matched
+    case-insensitively) is removed. Only the first author event is considered;
+    ``None`` is returned when it is empty after cleaning or when no such event exists.
+    """
+    for event in events:
+        if event["type"] != "meta" or event["subtype"] != "author":
+            continue
+        byline = event["text"].strip()
+        if byline.lower().startswith(AUTHOR_BYLINE_PREFIX):
+            byline = byline[len(AUTHOR_BYLINE_PREFIX) :].strip()
+        return byline or None
+    return None
+def collect_characters(events: Sequence[PlayEvent]) -> list[str]:
+    """List character names in order of first appearance.
+
+    Names come from speech speakers and from stage directions whose subtype is
+    ``"character"``; duplicates are filtered by ``add_character``.
+    """
+    ordered: list[str] = []
+    markers: set[str] = set()
+    for event in events:
+        kind = event["type"]
+        if kind == "speech":
+            add_character(ordered, markers, event["speaker"])
+        elif kind == "stage_direction" and event.get("subtype") == "character":
+            add_character(ordered, markers, event["text"])
+    return ordered
+def add_character(characters: list[str], seen: set[str], value: str) -> None:
+    """Append the stripped ``value`` to ``characters`` unless it is blank or already seen.
+
+    ``seen`` holds the normalised form of each accepted name; both arguments are
+    mutated in place.
+    """
+    cleaned = value.strip()
+    if cleaned:
+        key = normalize_inline_match_text(cleaned)
+        if key not in seen:
+            seen.add(key)
+            characters.append(cleaned)
+def collect_structure(events: Sequence[PlayEvent]) -> tuple[list[dict[str, object]], list[dict[str, object]]]:
+    """Summarise the ``act_start`` and ``scene_start`` events.
+
+    Returns:
+        ``(acts, scenes)``. Each scene records the number of the most recent act
+        seen before it, or ``None`` when no act has started yet.
+    """
+    acts: list[dict[str, object]] = []
+    scenes: list[dict[str, object]] = []
+    act_in_progress: int | None = None
+    for event in events:
+        kind = event["type"]
+        if kind == "act_start":
+            act_in_progress = event["act_number"]
+            acts.append({"act_number": act_in_progress, "label": event["label"]})
+        elif kind == "scene_start":
+            scene_summary = {
+                "act_number": act_in_progress,
+                "scene_number": event.get("scene_number"),
+                "label": event["label"],
+            }
+            scenes.append(scene_summary)
+    return acts, scenes
+def normalize_inline_match_text(text: str) -> str:
+    """Collapse whitespace runs to single spaces, strip the ends, and case-fold ``text``."""
+    collapsed = WHITESPACE_RE.sub(" ", text)
+    return collapsed.strip().casefold()

play_parser/document/constants.py ADDED Viewed

@@ -0,0 +1,17 @@
+from __future__ import annotations
+import re
+# Word token: a run of ASCII alphanumerics or the accented letters À-Ö, Ø-ö, ø-ÿ,
+# optionally followed by more such runs, each joined by a straight or curly apostrophe.
+WORD_RE = re.compile(r"[0-9A-Za-zÀ-ÖØ-öø-ÿ]+(?:['’][0-9A-Za-zÀ-ÖØ-öø-ÿ]+)*")
+# Stage-direction subtype names, kept as an ordered tuple.
+STAGE_DIRECTION_SUBTYPES = (
+    "enter",
+    "exit",
+    "action",
+    "aside",
+    "setting",
+    "sound",
+    "cast_list_heading",
+    "character",
+)
+# The same names as a frozenset, for membership tests.
+VALID_STAGE_DIRECTION_SUBTYPES = frozenset(STAGE_DIRECTION_SUBTYPES)

play_parser/document/text.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""Shared text helpers for parser and document validation."""
+from __future__ import annotations
+from .constants import WORD_RE
+def count_words(text: str) -> int:
+    """Return how many ``WORD_RE`` tokens occur in ``text``."""
+    return sum(1 for _ in WORD_RE.finditer(text))
+def normalise_whitespace(text: str) -> str:
+    """Return ``text`` with every whitespace run replaced by one space and the ends trimmed."""
+    words = text.split()
+    return " ".join(words)

play_parser/document/types.py ADDED Viewed

@@ -0,0 +1,92 @@
+from __future__ import annotations
+from typing import Literal, TypedDict
+class ActStartEvent(TypedDict):
+    """Event of type ``"act_start"`` carrying the act's number and label."""
+    type: Literal["act_start"]
+    act_number: int
+    label: str
+class SceneStartEvent(TypedDict):
+    """Event of type ``"scene_start"``; ``scene_number`` may be ``None``."""
+    type: Literal["scene_start"]
+    scene_number: int | None
+    label: str
+class SpeechEventRequired(TypedDict):
+    """Required keys of a speech event: speaker, text, and line and word counts."""
+    type: Literal["speech"]
+    speaker: str
+    text: str
+    line_count: int
+    word_count: int
+class SpeechEvent(SpeechEventRequired, total=False):
+    """Speech event: the required keys plus the optional boolean ``labelled``."""
+    labelled: bool
+# Literal type of the allowed stage-direction subtype names.
+StageDirectionSubtype = Literal[
+    "enter",
+    "exit",
+    "action",
+    "aside",
+    "setting",
+    "sound",
+    "cast_list_heading",
+    "character",
+]
+class StageDirectionEventRequired(TypedDict):
+    """Required keys of a stage-direction event: its subtype and text."""
+    type: Literal["stage_direction"]
+    subtype: StageDirectionSubtype
+    text: str
+class StageDirectionEvent(StageDirectionEventRequired, total=False):
+    """Stage-direction event; the optional ``attachment`` key only allows ``"next"``."""
+    attachment: Literal["next"]
+class MetaEvent(TypedDict):
+    """Event of type ``"meta"`` with a free-form string ``subtype`` and its text."""
+    type: Literal["meta"]
+    subtype: str
+    text: str
+class ActSummary(TypedDict):
+    """One entry of ``PlayMetadata.acts``: an act's number and label."""
+    act_number: int
+    label: str
+class SceneSummary(TypedDict):
+    """One entry of ``PlayMetadata.scenes``; both numbers may be ``None``."""
+    act_number: int | None
+    scene_number: int | None
+    label: str
+class PlayStats(TypedDict):
+    """Integer counters stored under ``PlayMetadata.stats``."""
+    stage_directions: int
+    speeches: int
+    scenes: int
+    acts: int
+    source_words: int
+    spoken_words: int
+class PlayMetadata(TypedDict):
+    """Metadata section of a play document; ``title`` and ``author`` may be ``None``."""
+    title: str | None
+    author: str | None
+    characters: list[str]
+    acts: list[ActSummary]
+    scenes: list[SceneSummary]
+    stats: PlayStats
+class PlayDocument(TypedDict):
+    """Top-level document: a metadata section plus the list of events."""
+    metadata: PlayMetadata
+    # ``PlayEvent`` is assigned further down this module; the name is not evaluated here
+    # because of the module's ``from __future__ import annotations``.
+    events: list[PlayEvent]
+# Union of the event shapes allowed in ``PlayDocument.events``.
+PlayEvent = ActStartEvent | SceneStartEvent | SpeechEvent | StageDirectionEvent | MetaEvent

play_parser/document/validation.py ADDED Viewed

@@ -0,0 +1,250 @@
+"""
+Validation functions for play parser events and roundtrip testing.
+"""
+from collections.abc import Sequence
+from typing import cast
+from .assembler import assemble_play_text
+from .constants import VALID_STAGE_DIRECTION_SUBTYPES
+from .types import PlayDocument, PlayEvent, PlayMetadata
+# Default ``context_size`` for diff_context: normalized characters shown on each side
+# of the first difference.
+DEFAULT_DIFF_CONTEXT_SIZE = 50
+# Keys that validate_metadata requires in the metadata object.
+REQUIRED_METADATA_FIELDS = {"title", "author", "characters", "acts", "scenes", "stats"}
+# Keys required in every entry of metadata.acts.
+REQUIRED_ACT_SUMMARY_FIELDS = {"act_number", "label"}
+# Keys required in every entry of metadata.scenes.
+REQUIRED_SCENE_SUMMARY_FIELDS = {"act_number", "scene_number", "label"}
+# Keys required in metadata.stats; validate_stats requires each to be an integer.
+REQUIRED_STATS_FIELDS = {"stage_directions", "speeches", "scenes", "acts", "source_words", "spoken_words"}
+def count_lines(text: str) -> int:
+    """Return the number of lines ``str.splitlines`` finds in ``text`` (0 for empty text)."""
+    return len(text.splitlines()) if text else 0
+def normalize_for_comparison(text: str) -> str:
+    """Lower-case ``text`` and drop every whitespace character so layout and case are ignored."""
+    lowered = text.lower()
+    return "".join(lowered.split())
+def is_integer(value: object) -> bool:
+    """Return True for ``int`` values, excluding ``bool`` (which subclasses ``int``)."""
+    if isinstance(value, bool):
+        return False
+    return isinstance(value, int)
+def is_non_negative_integer(value: object) -> bool:
+    """Return True when ``value`` passes ``is_integer`` and is zero or greater."""
+    if not is_integer(value):
+        return False
+    return value >= 0
+def require_fields(event: dict[str, object], required_fields: set[str], event_type: str) -> None:
+    """Raise when ``event`` lacks any of ``required_fields``.
+
+    Args:
+        event: Event mapping to inspect.
+        required_fields: Keys that must be present.
+        event_type: Event type name used as the prefix of the error message.
+
+    Raises:
+        ValueError: Naming the missing keys in sorted order.
+    """
+    missing = required_fields - event.keys()
+    if missing:
+        # Sort so the message is stable; a raw set's repr depends on string hash order.
+        # This also matches how require_metadata_fields reports missing keys.
+        raise ValueError(f"{event_type} event missing required fields: {sorted(missing)}")
+def validate_metadata(metadata: object) -> PlayMetadata:
+    """Validate the metadata object of a play document and return it typed.
+
+    Raises:
+        ValueError: If ``metadata`` is not a dict or any section fails validation.
+    """
+    if not isinstance(metadata, dict):
+        raise ValueError("Canonical play JSON must contain 'metadata' as an object")
+    require_metadata_fields(metadata)
+    validate_title_and_author(metadata)
+    # Sections are checked in this fixed order so the first failure reported is stable.
+    section_validators = (
+        ("characters", validate_character_summaries),
+        ("acts", validate_act_summaries),
+        ("scenes", validate_scene_summaries),
+        ("stats", validate_stats),
+    )
+    for section, validator in section_validators:
+        validator(metadata[section])
+    return cast(PlayMetadata, metadata)
+def require_metadata_fields(metadata: dict[str, object]) -> None:
+    """Raise ``ValueError`` listing (sorted) any ``REQUIRED_METADATA_FIELDS`` keys absent from ``metadata``."""
+    missing = REQUIRED_METADATA_FIELDS.difference(metadata)
+    if not missing:
+        return
+    raise ValueError(f"Canonical play JSON metadata is missing required fields: {sorted(missing)}")
+def validate_title_and_author(metadata: dict[str, object]) -> None:
+    """Require ``metadata["title"]`` and ``metadata["author"]`` to each be a string or ``None``.
+
+    Raises:
+        ValueError: For the first of the two fields (title, then author) that has another type.
+    """
+    checks = (
+        ("title", "Canonical play JSON metadata.title must be a string or null"),
+        ("author", "Canonical play JSON metadata.author must be a string or null"),
+    )
+    for key, message in checks:
+        value = metadata[key]
+        if not (value is None or isinstance(value, str)):
+            raise ValueError(message)
+def validate_character_summaries(characters: object) -> None:
+    """Raise ``ValueError`` unless ``characters`` is a list whose items are all strings."""
+    if isinstance(characters, list) and all(isinstance(item, str) for item in characters):
+        return
+    raise ValueError("Canonical play JSON metadata.characters must be a list of strings")
+def validate_act_summaries(acts: object) -> None:
+    """Require ``acts`` to be a list and validate each entry with ``validate_act_summary``."""
+    if isinstance(acts, list):
+        for position, entry in enumerate(acts):
+            validate_act_summary(entry, position)
+        return
+    raise ValueError("Canonical play JSON metadata.acts must be a list")
+def validate_act_summary(act: object, index: int) -> None:
+    """Validate one ``metadata.acts`` entry; ``index`` only locates it in error messages.
+
+    Raises:
+        ValueError: If ``act`` is not a dict, lacks a required key, has a
+            non-integer ``act_number``, or has a non-string ``label``.
+    """
+    if not isinstance(act, dict):
+        raise ValueError(f"Canonical play JSON metadata.acts[{index}] must be an object")
+    missing_act_fields = REQUIRED_ACT_SUMMARY_FIELDS.difference(act)
+    if missing_act_fields:
+        raise ValueError(
+            f"Canonical play JSON metadata.acts[{index}] is missing required fields: {sorted(missing_act_fields)}"
+        )
+    number_is_valid = is_integer(act["act_number"])
+    if not number_is_valid:
+        raise ValueError(f"Canonical play JSON metadata.acts[{index}].act_number must be an integer")
+    label_is_valid = isinstance(act["label"], str)
+    if not label_is_valid:
+        raise ValueError(f"Canonical play JSON metadata.acts[{index}].label must be a string")
+def validate_scene_summaries(scenes: object) -> None:
+    """Require ``scenes`` to be a list and validate each entry with ``validate_scene_summary``."""
+    if isinstance(scenes, list):
+        for position, entry in enumerate(scenes):
+            validate_scene_summary(entry, position)
+        return
+    raise ValueError("Canonical play JSON metadata.scenes must be a list")
+def validate_scene_summary(scene: object, index: int) -> None:
+    """Validate one ``metadata.scenes`` entry; ``index`` only locates it in error messages.
+
+    Raises:
+        ValueError: If ``scene`` is not a dict, lacks a required key, has an
+            ``act_number`` or ``scene_number`` that is neither an integer nor
+            ``None``, or has a non-string ``label``.
+    """
+    if not isinstance(scene, dict):
+        raise ValueError(f"Canonical play JSON metadata.scenes[{index}] must be an object")
+    missing_scene_fields = REQUIRED_SCENE_SUMMARY_FIELDS.difference(scene)
+    if missing_scene_fields:
+        raise ValueError(
+            f"Canonical play JSON metadata.scenes[{index}] is missing required fields: {sorted(missing_scene_fields)}"
+        )
+    act_number = scene["act_number"]
+    if not (act_number is None or is_integer(act_number)):
+        raise ValueError(f"Canonical play JSON metadata.scenes[{index}].act_number must be an integer or null")
+    scene_number = scene["scene_number"]
+    if not (scene_number is None or is_integer(scene_number)):
+        raise ValueError(f"Canonical play JSON metadata.scenes[{index}].scene_number must be an integer or null")
+    if not isinstance(scene["label"], str):
+        raise ValueError(f"Canonical play JSON metadata.scenes[{index}].label must be a string")
+def validate_stats(stats: object) -> None:
+    """Require ``stats`` to be a dict holding an integer for every ``REQUIRED_STATS_FIELDS`` key.
+
+    Fields are checked in sorted order, so the first failure reported is stable.
+    """
+    if not isinstance(stats, dict):
+        raise ValueError("Canonical play JSON metadata.stats must be an object")
+    missing_stats_fields = REQUIRED_STATS_FIELDS.difference(stats)
+    if missing_stats_fields:
+        raise ValueError(
+            f"Canonical play JSON metadata.stats is missing required fields: {sorted(missing_stats_fields)}"
+        )
+    for field_name in sorted(REQUIRED_STATS_FIELDS):
+        if is_integer(stats[field_name]):
+            continue
+        raise ValueError(f"Canonical play JSON metadata.stats.{field_name} must be an integer")
+def validate_play_document(document: object) -> PlayDocument:
+    """Validate a decoded play document and return it as a ``PlayDocument``.
+
+    Raises:
+        ValueError: If ``document`` is not a dict, ``metadata`` is not a dict,
+            ``events`` is not a list, or either part fails validation.
+    """
+    if not isinstance(document, dict):
+        raise ValueError("Canonical play JSON must be an object with 'metadata' and 'events'")
+    metadata, events = document.get("metadata"), document.get("events")
+    if not (isinstance(metadata, dict) and isinstance(events, list)):
+        raise ValueError("Canonical play JSON must contain 'metadata' and 'events'")
+    validated_metadata = validate_metadata(metadata)
+    validate_events(events)
+    typed_events = cast(list[PlayEvent], events)
+    return {"metadata": validated_metadata, "events": typed_events}
+def validate_act_start_event(event: dict[str, object]) -> None:
+    """Validate an ``act_start`` event.
+
+    Raises:
+        ValueError: If a required field is missing, ``act_number`` is not an
+            integer, or ``label`` is not a string.
+    """
+    require_fields(event, {"type", "act_number", "label"}, "act_start")
+    if not is_integer(event["act_number"]):
+        raise ValueError("act_start.act_number must be an integer")
+    # Keep events consistent with validate_act_summary, which requires a string label.
+    if not isinstance(event["label"], str):
+        raise ValueError("act_start.label must be a string")
+def validate_scene_start_event(event: dict[str, object]) -> None:
+    """Validate a ``scene_start`` event.
+
+    ``scene_number`` is optional; when present it must be an integer or ``None``.
+
+    Raises:
+        ValueError: If a required field is missing, ``scene_number`` has another
+            type, or ``label`` is not a string.
+    """
+    require_fields(event, {"type", "label"}, "scene_start")
+    if "scene_number" in event and event["scene_number"] is not None and not is_integer(event["scene_number"]):
+        raise ValueError("scene_start.scene_number must be an integer or null")
+    # Keep events consistent with validate_scene_summary, which requires a string label.
+    if not isinstance(event["label"], str):
+        raise ValueError("scene_start.label must be a string")
+def validate_speech_event(event: dict[str, object]) -> None:
+    """Validate a ``speech`` event.
+
+    Raises:
+        ValueError: If a required field is missing, ``speaker`` is empty or not
+            a string, ``text`` is not a string, ``line_count`` or ``word_count``
+            is not a non-negative integer, or ``labelled`` is present but not a
+            boolean.
+    """
+    require_fields(event, {"type", "speaker", "text", "line_count", "word_count"}, "speech")
+    if not event["speaker"]:
+        raise ValueError("speech event must have non-empty speaker")
+    # A truthy non-string speaker (e.g. a number) would otherwise pass here and
+    # only fail later, when string methods are applied to it.
+    if not isinstance(event["speaker"], str):
+        raise ValueError("speech.speaker must be a string")
+    if not isinstance(event["text"], str):
+        raise ValueError("speech.text must be a string")
+    if not is_non_negative_integer(event["line_count"]):
+        raise ValueError("speech.line_count must be a non-negative integer")
+    if not is_non_negative_integer(event["word_count"]):
+        raise ValueError("speech.word_count must be a non-negative integer")
+    if "labelled" in event and not isinstance(event["labelled"], bool):
+        raise ValueError("speech.labelled must be a boolean when present")
+def validate_stage_direction_event(event: dict[str, object]) -> None:
+    """Validate a ``stage_direction`` event.
+
+    An ``attachment`` of ``None`` is accepted in addition to ``"next"``.
+
+    Raises:
+        ValueError: If a required field is missing, ``subtype`` is not one of
+            ``VALID_STAGE_DIRECTION_SUBTYPES``, ``text`` is not a string, or
+            ``attachment`` has any value other than ``"next"`` or ``None``.
+    """
+    require_fields(event, {"type", "subtype", "text"}, "stage_direction")
+    if not isinstance(event["subtype"], str) or event["subtype"] not in VALID_STAGE_DIRECTION_SUBTYPES:
+        raise ValueError(f"Invalid stage_direction subtype: {event['subtype']}")
+    # Reject non-string text here so later string handling of the event cannot fail.
+    if not isinstance(event["text"], str):
+        raise ValueError("stage_direction.text must be a string")
+    if "attachment" in event and event["attachment"] is not None and event["attachment"] != "next":
+        raise ValueError("stage_direction.attachment must be 'next' when present")
+def validate_meta_event(event: dict[str, object]) -> None:
+    """Validate a ``meta`` event.
+
+    Raises:
+        ValueError: If a required field is missing, or ``subtype`` or ``text``
+            is not a string.
+    """
+    require_fields(event, {"type", "subtype", "text"}, "meta")
+    # Both fields are declared ``str`` on MetaEvent; enforce that at validation time.
+    if not isinstance(event["subtype"], str):
+        raise ValueError("meta.subtype must be a string")
+    if not isinstance(event["text"], str):
+        raise ValueError("meta.text must be a string")
+# Dispatch table used by validate_event_schema: event ``type`` value -> validator function.
+_EVENT_VALIDATORS = {
+    "act_start": validate_act_start_event,
+    "scene_start": validate_scene_start_event,
+    "speech": validate_speech_event,
+    "stage_direction": validate_stage_direction_event,
+    "meta": validate_meta_event,
+}
+def validate_event_schema(event: PlayEvent) -> None:
+    """Validate that an event conforms to the expected schema.
+
+    The event's ``type`` selects a validator from ``_EVENT_VALIDATORS``.
+
+    Raises:
+        ValueError: If ``event`` is not a dict, has no ``type`` key, has an
+            unknown (or non-string) ``type``, or fails its type-specific validator.
+    """
+    if not isinstance(event, dict):
+        raise ValueError(f"Event must be a dict, got {type(event)}")
+    if "type" not in event:
+        raise ValueError("Event must have 'type' field")
+    event_type = event["type"]
+    # Only look up strings: an unhashable value (list, dict) would make the dict
+    # lookup raise TypeError instead of the ValueError callers handle.
+    validator = _EVENT_VALIDATORS.get(event_type) if isinstance(event_type, str) else None
+    if validator is None:
+        raise ValueError(f"Unknown event type: {event_type}")
+    validator(cast(dict[str, object], event))
+def validate_events(events: Sequence[PlayEvent]) -> None:
+    """Validate every event in order, stopping at the first invalid one.
+
+    Raises:
+        ValueError: Chained from the schema error and prefixed with the
+            failing event's index.
+    """
+    for index, candidate in enumerate(events):
+        try:
+            validate_event_schema(candidate)
+        except ValueError as exc:
+            raise ValueError(f"Invalid event at index {index}: {exc}") from exc
+def validate_roundtrip(original_text: str, events: Sequence[PlayEvent]) -> bool:
+    """Return True when the text reassembled from ``events`` equals ``original_text``.
+
+    Both sides go through ``normalize_for_comparison``, so case and whitespace are ignored.
+    """
+    rebuilt = assemble_play_text(events)
+    expected = normalize_for_comparison(original_text)
+    actual = normalize_for_comparison(rebuilt)
+    return expected == actual
+def diff_context(original: str, assembled: str, context_size: int = DEFAULT_DIFF_CONTEXT_SIZE) -> str:
+    """Describe the first position where the normalized texts differ.
+
+    Args:
+        original: Source text.
+        assembled: Text to compare against it.
+        context_size: Normalized characters to show on each side of the difference.
+
+    Returns:
+        ``"No differences found"`` when the normalized texts are equal; otherwise
+        a three-line report with the position and a slice of each text around it.
+    """
+    original_norm = normalize_for_comparison(original)
+    assembled_norm = normalize_for_comparison(assembled)
+    paired = zip(original_norm, assembled_norm)
+    diff_pos = next((i for i, (left, right) in enumerate(paired) if left != right), None)
+    if diff_pos is None:
+        if len(original_norm) == len(assembled_norm):
+            return "No differences found"
+        # One text is a prefix of the other: they diverge where the shorter one ends.
+        diff_pos = min(len(original_norm), len(assembled_norm))
+    start = max(0, diff_pos - context_size)
+    end = diff_pos + context_size
+    return (
+        f"Difference at normalized position {diff_pos}:\n"
+        f"Original:  ...{original_norm[start:end]}...\n"
+        f"Assembled: ...{assembled_norm[start:end]}..."
+    )

play_parser/domain/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+from .play import Act, Character, Dialogue, Monologue, Play, Scene, Speech
+# Public API of this package: the names imported from ``.play`` above.
+__all__ = [
+    "Act",
+    "Scene",
+    "Dialogue",
+    "Character",
+    "Speech",
+    "Monologue",
+    "Play",
+]