txtdown 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
txtdown/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ """txtdown: Minimal markup for Latin text collections.
2
+
3
+ Example usage:
4
+ >>> from txtdown import parse
5
+ >>> doc = parse("sulpicia.txtd")
6
+ >>> print(doc.metadata.author)
7
+ Sulpicia
8
+ >>> print(doc.sections[0].lines[0].text)
9
+ Tandem venit amor, qualem texisse pudori
10
+ >>> line = doc.get("1.3")
11
+ >>> print(line.text)
12
+ exorata meis illum Cytherea Camenis
13
+ """
14
+
15
+ from .models import Document, Line, Metadata, Section
16
+ from .parser import parse
17
+ from .writer import write
18
+
19
+ __version__ = "0.2.0"
20
+
21
+ __all__ = [
22
+ "Document",
23
+ "Line",
24
+ "Metadata",
25
+ "Section",
26
+ "parse",
27
+ "write",
28
+ "__version__",
29
+ ]
txtdown/models.py ADDED
@@ -0,0 +1,181 @@
1
+ """Data models for txtdown documents."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+
6
+
7
+ @dataclass
8
+ class Metadata:
9
+ """Document metadata from YAML front matter.
10
+
11
+ Attributes:
12
+ author: Author name
13
+ work: Work title
14
+ source: Source URL or reference
15
+ scope: For partial files (e.g., "1" or "1-12")
16
+ extras: Additional key-value pairs
17
+ """
18
+ author: str | None = None
19
+ work: str | None = None
20
+ source: str | None = None
21
+ scope: str | None = None
22
+ extras: dict[str, Any] = field(default_factory=dict)
23
+
24
+ @classmethod
25
+ def from_dict(cls, data: dict[str, Any]) -> "Metadata":
26
+ """Create Metadata from a dictionary (e.g., parsed YAML)."""
27
+ known_fields = {"author", "work", "source", "scope"}
28
+ extras = {k: v for k, v in data.items() if k not in known_fields}
29
+ # Ensure scope is always a string (YAML may parse "1" as int)
30
+ scope = data.get("scope")
31
+ if scope is not None:
32
+ scope = str(scope)
33
+ return cls(
34
+ author=data.get("author"),
35
+ work=data.get("work"),
36
+ source=data.get("source"),
37
+ scope=scope,
38
+ extras=extras,
39
+ )
40
+
41
+ def to_dict(self) -> dict[str, Any]:
42
+ """Convert to dictionary for YAML serialization."""
43
+ result: dict[str, Any] = {}
44
+ if self.author:
45
+ result["author"] = self.author
46
+ if self.work:
47
+ result["work"] = self.work
48
+ if self.source:
49
+ result["source"] = self.source
50
+ if self.scope:
51
+ result["scope"] = self.scope
52
+ result.update(self.extras)
53
+ return result
54
+
55
+
56
+ @dataclass
57
+ class Line:
58
+ """A single line of text.
59
+
60
+ Attributes:
61
+ text: The line content
62
+ number: Line number within the section (1-indexed)
63
+ speaker: Speaker name for dramatic texts (None for non-dialogue)
64
+ label: Editorial line label when it differs from number (e.g., "983a")
65
+ is_quote: True if the line is a cross-source quotation (``>`` markup),
66
+ i.e. verbatim text quoted from another author/work
67
+ """
68
+ text: str
69
+ number: int
70
+ speaker: str | None = None
71
+ label: str | None = None
72
+ is_quote: bool = False
73
+
74
+ def __str__(self) -> str:
75
+ return self.text
76
+
77
+
78
@dataclass
class Section:
    """A section of text (poem, chapter, etc.).

    Attributes:
        id: Section identifier (number or name)
        lines: List of lines in this section
        is_numbered: Whether the ID is a number (vs. a name)
        title: Optional section title
        metadata: Section-specific metadata (supersedes document metadata)

    Note:
        Indexing with [] uses 1-based indexing to match scholarly citations.
        Use section[1] for the first line, not section[0].
        Iterating a section yields its lines in order.
    """
    id: str
    lines: list[Line] = field(default_factory=list)
    is_numbered: bool = True
    title: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def text(self) -> str:
        """Return section text as a single newline-joined string."""
        return "\n".join(line.text for line in self.lines)

    def __len__(self) -> int:
        return len(self.lines)

    def __iter__(self):
        """Iterate over the lines of this section in order.

        Defined explicitly because __getitem__ is 1-based: without it Python
        falls back to calling __getitem__(0), which raises IndexError and
        makes ``for line in section`` silently yield nothing.
        """
        return iter(self.lines)

    def __getitem__(self, index: int) -> Line:
        """Get line by 1-indexed position.

        Raises:
            IndexError: If index is below 1 or beyond the last line
        """
        if index < 1 or index > len(self.lines):
            raise IndexError(f"Line {index} out of range (1-{len(self.lines)})")
        return self.lines[index - 1]
112
+
113
+
114
@dataclass
class Document:
    """A complete txtdown document.

    Attributes:
        metadata: Document metadata
        sections: List of sections

    Note:
        Indexing with [] uses 1-based indexing to match scholarly citations.
        Use doc[1] for the first section, not doc[0].
        For citation-based access, use doc.get("1") or doc.get("1.3").
        Iterating a document yields its sections in order.
    """
    metadata: Metadata = field(default_factory=Metadata)
    sections: list[Section] = field(default_factory=list)

    def get(self, citation: str) -> Line | Section:
        """Retrieve content by citation.

        Args:
            citation: Citation string like "2" (section) or "2.3" (section.line)

        Returns:
            Section if single-level citation, Line if two-level

        Raises:
            KeyError: If section or line not found
        """
        parts = citation.split(".")

        # Find the section whose ID matches the first citation component.
        section_id = parts[0]
        section = next((s for s in self.sections if s.id == section_id), None)
        if section is None:
            raise KeyError(f"Section '{section_id}' not found")

        if len(parts) == 1:
            return section

        line_ref = parts[1]
        msg = f"Line '{line_ref}' not found in section '{section_id}'"

        # Editorial labels win (handles "983a" etc.).
        for line in section.lines:
            if line.label == line_ref:
                return line

        # Otherwise resolve by the line's own number, not by its position in
        # the list: the two differ once a section carries explicit numbering.
        try:
            line_num = int(line_ref)
        except ValueError as e:
            raise KeyError(msg) from e
        for line in section.lines:
            if line.number == line_num:
                return line
        raise KeyError(msg)

    def __len__(self) -> int:
        return len(self.sections)

    def __iter__(self):
        """Iterate over the sections of this document in order.

        Defined explicitly because __getitem__ is 1-based: without it Python
        falls back to calling __getitem__(0), which raises IndexError and
        makes ``for section in doc`` silently yield nothing.
        """
        return iter(self.sections)

    def __getitem__(self, index: int) -> Section:
        """Get section by 1-indexed position.

        Raises:
            IndexError: If index is below 1 or beyond the last section
        """
        if index < 1 or index > len(self.sections):
            raise IndexError(f"Section {index} out of range (1-{len(self.sections)})")
        return self.sections[index - 1]
txtdown/parser.py ADDED
@@ -0,0 +1,379 @@
1
+ """Parser for txtdown format."""
2
+
3
+ import re
4
+ import warnings
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import yaml
9
+
10
+ from .models import Document, Line, Metadata, Section
11
+
12
# Section separator: a line opening with three or more dashes; whatever
# follows (after optional whitespace) is captured as the raw header.
SECTION_SEP_PATTERN = re.compile(r"^-{3,}\s*(.*)$")

# Speaker markup: "@SingleWord: speech text" -> (speaker, speech).
SPEAKER_PATTERN = re.compile(r"^@(\w+):\s*(.*)")

# Cross-source quotation: "> verbatim quoted text".
# At most one whitespace character after ">" belongs to the marker itself;
# anything beyond that is kept as part of the quoted text.
QUOTE_PATTERN = re.compile(r"^>\s?(.*)")

# Explicit leading line number: "6. text" or "983. text" -> (number, text).
LEADING_NUMBER_PATTERN = re.compile(r"^(\d+)\.\s+(.*)")

# Trailing line label: digits plus an optional lowercase letter ("980",
# "983a") at the end of the line. At least two whitespace characters must
# separate text and label, so a number ending ordinary prose is not a label.
TRAILING_LABEL_PATTERN = re.compile(r"^(.*?)\s{2,}(\d+[a-z]?)\s*$")
29
+
30
+
31
def parse(source: str | Path, *, strict: bool = True) -> Document:
    """Parse txtdown from a file path or from a content string.

    A ``Path`` is always read from disk. A ``str`` is read from disk only when
    it looks like a file path; otherwise it is parsed as txtdown content.

    Args:
        source: File path or txtdown content string
        strict: When True (default), require a YAML front matter block with a
            ``work`` field and raise ValueError if either is missing. Pass
            ``strict=False`` to parse a fragment (e.g. a single line or
            section) without metadata.

    Returns:
        Parsed Document object

    Raises:
        ValueError: In strict mode, when the front matter block or the ``work``
            field is missing.
    """
    from_file = isinstance(source, Path) or (
        isinstance(source, str) and _looks_like_path(source)
    )
    content = Path(source).read_text(encoding="utf-8") if from_file else source
    return _parse_content(content, strict=strict)
58
+
59
+
60
+ def _looks_like_path(s: str) -> bool:
61
+ """Heuristic to detect if string is a file path."""
62
+ # Empty string is not a path
63
+ if not s or not s.strip():
64
+ return False
65
+ # If it starts with ---, it's content
66
+ if s.strip().startswith("---"):
67
+ return False
68
+ # If it contains newlines, it's content
69
+ if "\n" in s:
70
+ return False
71
+ # If it ends with .txtdown or .td, it's a path
72
+ if s.endswith((".txtd", ".txtdown")):
73
+ return True
74
+ # If it exists as a file (not directory), it's a path
75
+ p = Path(s)
76
+ return p.exists() and p.is_file()
77
+
78
+
79
def _parse_content(content: str, strict: bool = True) -> Document:
    """Turn a txtdown content string into a Document.

    Args:
        content: Full txtdown text
        strict: When True, validate the required document structure

    Returns:
        The parsed Document
    """
    raw_lines = content.split("\n")

    # body_start is positive only when a closed front matter block was found.
    metadata, body_start = _parse_front_matter(raw_lines)
    doc = Document(
        metadata=metadata,
        sections=_parse_sections(raw_lines[body_start:]),
    )

    if strict:
        _validate(doc, body_start > 0)
    return doc
96
+
97
+
98
+ def _validate(doc: Document, had_front_matter: bool) -> None:
99
+ """Enforce the required document structure in strict mode."""
100
+ if not had_front_matter:
101
+ raise ValueError(
102
+ "txtdown requires a YAML front matter block (--- ... ---). "
103
+ "Pass strict=False to parse a fragment without metadata."
104
+ )
105
+ if not doc.metadata.work:
106
+ raise ValueError(
107
+ "txtdown requires a 'work' field in the front matter. "
108
+ "Pass strict=False to parse without it."
109
+ )
110
+
111
+
112
def _parse_front_matter(lines: list[str]) -> tuple[Metadata, int]:
    """Parse YAML front matter.

    The block must open with a ``---`` line (leading blank lines are allowed)
    and close with a ``---`` or ``...`` line. If it is missing or unclosed,
    the whole input is treated as body.

    Args:
        lines: All lines of the input

    Returns:
        Tuple of (Metadata, index of first body line). The index is 0 when no
        closed front matter block was found.
    """
    # Find opening ---
    start = 0
    while start < len(lines) and not lines[start].strip():
        start += 1

    if start >= len(lines) or lines[start].strip() != "---":
        return Metadata(), 0

    # Find closing --- (or the YAML document-end marker ...)
    end = start + 1
    while end < len(lines) and lines[end].strip() not in ("---", "..."):
        end += 1

    if end >= len(lines):
        # No closing delimiter - treat as no front matter
        return Metadata(), 0

    # Parse YAML; a malformed block degrades to empty metadata with a warning
    yaml_content = "\n".join(lines[start + 1 : end])
    try:
        data = yaml.safe_load(yaml_content) or {}
    except yaml.YAMLError as e:
        warnings.warn(f"Failed to parse YAML front matter: {e}", stacklevel=3)
        data = {}

    # Valid YAML need not be a mapping (a bare scalar or a list parses fine),
    # but Metadata.from_dict needs key/value pairs.
    if not isinstance(data, dict):
        warnings.warn(
            "YAML front matter must be a mapping of keys to values, "
            f"got {type(data).__name__}; ignoring it",
            stacklevel=3,
        )
        data = {}

    return Metadata.from_dict(data), end + 1
147
+
148
+
149
+ def _parse_section_header(header: str) -> tuple[str | None, str | None]:
150
+ """Parse section header into ID and title.
151
+
152
+ Formats supported:
153
+ "99" -> id="99", title=None
154
+ "99: Title here" -> id="99", title="Title here"
155
+ "prooemium" -> id="prooemium", title=None
156
+ "prooemium: Introduction" -> id="prooemium", title="Introduction"
157
+
158
+ Returns:
159
+ Tuple of (id, title), either may be None.
160
+ """
161
+ if not header:
162
+ return None, None
163
+
164
+ # Check for "id: title" format
165
+ if ":" in header:
166
+ id_part, title_part = header.split(":", 1)
167
+ return id_part.strip(), title_part.strip() or None
168
+
169
+ return header.strip(), None
170
+
171
+
172
+ def _is_metadata_line(line: str) -> bool:
173
+ """Check if a line looks like YAML metadata (key: value)."""
174
+ stripped = line.strip()
175
+ if not stripped:
176
+ return False
177
+ # Must have colon with content on both sides
178
+ if ":" not in stripped:
179
+ return False
180
+ # Split on first colon
181
+ key, _, value = stripped.partition(":")
182
+ # Key must be a simple identifier (no spaces, alphanumeric + underscore)
183
+ if not key or not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", key):
184
+ return False
185
+ return True
186
+
187
+
188
+ def _parse_section_metadata(
189
+ lines: list[str], start_idx: int
190
+ ) -> tuple[dict[str, Any], int]:
191
+ """Parse section metadata from lines immediately following section separator.
192
+
193
+ Args:
194
+ lines: All lines in the section
195
+ start_idx: Index to start looking for metadata
196
+
197
+ Returns:
198
+ Tuple of (metadata dict, index of first content line)
199
+ """
200
+ metadata: dict[str, Any] = {}
201
+ idx = start_idx
202
+
203
+ # Skip any leading blank lines - metadata must immediately follow separator
204
+ # Actually, no - metadata must IMMEDIATELY abut the separator (no blank line)
205
+ # So if first line is blank, there's no section metadata
206
+
207
+ while idx < len(lines):
208
+ line = lines[idx]
209
+
210
+ # Blank line signals end of metadata, start of content
211
+ if not line.strip():
212
+ break
213
+
214
+ # Check if this looks like metadata
215
+ if _is_metadata_line(line):
216
+ key, _, value = line.strip().partition(":")
217
+ value = value.strip()
218
+ # Try to parse as YAML-ish value (bool, int, etc.)
219
+ if value.lower() == "true":
220
+ metadata[key] = True
221
+ elif value.lower() == "false":
222
+ metadata[key] = False
223
+ elif value.isdigit():
224
+ metadata[key] = int(value)
225
+ else:
226
+ metadata[key] = value
227
+ idx += 1
228
+ else:
229
+ # Not metadata - this is content
230
+ break
231
+
232
+ return metadata, idx
233
+
234
+
235
def _parse_sections(lines: list[str]) -> list[Section]:
    """Split the document body into sections.

    A separator line closes the section being collected and opens a new one;
    its header supplies the new section's ID/title, and any metadata lines
    directly below it are consumed. Sections without any non-blank content
    are dropped. A section without an explicit ID is numbered by its position
    among the sections actually emitted.
    """
    sections: list[Section] = []
    buffer: list[str] = []
    header_id: str | None = None
    header_title: str | None = None
    header_meta: dict[str, Any] = {}

    def emit() -> bool:
        """Append the buffered section if it has content; report whether it did."""
        if not any(raw.strip() for raw in buffer):
            return False
        ident = header_id or str(len(sections) + 1)
        sections.append(_make_section(ident, buffer, header_title, header_meta))
        return True

    pos = 0
    total = len(lines)
    while pos < total:
        raw = lines[pos]
        sep = SECTION_SEP_PATTERN.match(raw)
        pos += 1

        if sep is None:
            buffer.append(raw)
            continue

        # Separator: close the previous section (only if it had content).
        if emit():
            buffer = []
            header_meta = {}

        header_id, header_title = _parse_section_header(sep.group(1).strip())

        # Metadata, if any, sits directly under the separator; jump past it.
        if pos < total:
            header_meta, pos = _parse_section_metadata(lines, pos)

    # The final section has no separator after it.
    emit()
    return sections
289
+
290
+
291
def _make_section(
    section_id: str,
    raw_lines: list[str],
    title: str | None = None,
    metadata: dict[str, Any] | None = None,
) -> Section:
    """Create a Section from raw lines.

    Blank lines are skipped and do not count toward numbering, so there is no
    need to trim leading/trailing blanks first; ``raw_lines`` is left
    unmodified.

    Args:
        section_id: Section identifier
        raw_lines: Raw text lines belonging to the section
        title: Optional section title
        metadata: Optional section metadata

    Returns:
        The assembled Section
    """
    lines: list[Line] = []
    last_number = 0
    for raw in raw_lines:
        if not raw.strip():
            continue

        # Cross-source quotation: > marks verbatim text quoted from another
        # source. Quoted text is preserved as-is (no number/label extraction)
        # and simply takes the next number.
        quote_match = QUOTE_PATTERN.match(raw.lstrip())
        if quote_match:
            last_number += 1
            lines.append(
                Line(text=quote_match.group(1), number=last_number, is_quote=True)
            )
            continue

        text, number, label = _extract_line_numbering(raw, last_number)
        last_number = number

        # Speaker markup is detected after numbering has been removed, so
        # "6. @Name: speech" yields both a number and a speaker.
        speaker_match = SPEAKER_PATTERN.match(text)
        if speaker_match:
            lines.append(
                Line(
                    text=speaker_match.group(2),
                    number=number,
                    speaker=speaker_match.group(1),
                    label=label,
                )
            )
        else:
            lines.append(Line(text=text, number=number, label=label))

    return Section(
        id=section_id,
        lines=lines,
        is_numbered=section_id.isdigit(),
        title=title,
        metadata=metadata or {},
    )
343
+
344
+
345
def _extract_line_numbering(
    text: str, last_number: int
) -> tuple[str, int, str | None]:
    """Pull explicit numbering out of a raw line.

    Three styles are recognised:
    - Leading prefix: "6. suave etiam..." → number=6, text="suave etiam..."
    - Trailing label: "servo id;  980" → number=auto, label="980"
    - Implicit: number is last_number + 1

    A trailing label is removed first, so a line may carry both a leading
    number and a trailing label.

    Args:
        text: Raw line text
        last_number: Previous line's number (for auto-increment)

    Returns:
        Tuple of (cleaned_text, number, label)
    """
    label: str | None = None

    trailing = TRAILING_LABEL_PATTERN.match(text)
    if trailing is not None:
        text, label = trailing.group(1).rstrip(), trailing.group(2)

    leading = LEADING_NUMBER_PATTERN.match(text)
    if leading is None:
        # No explicit number: continue counting from the previous line.
        return text, last_number + 1, label
    return leading.group(2), int(leading.group(1)), label
txtdown/writer.py ADDED
@@ -0,0 +1,106 @@
1
+ """Writer for txtdown format."""
2
+
3
+ from pathlib import Path
4
+
5
+ import yaml
6
+
7
+ from .models import Document
8
+
9
+
10
def write(doc: Document, path: str | Path | None = None) -> str:
    """Serialize a Document to txtdown, optionally saving it to disk.

    Args:
        doc: Document to serialize
        path: Optional file path to write to (UTF-8)

    Returns:
        The txtdown content as a string, whether or not a file was written
    """
    content = _serialize(doc)
    if path is None:
        return content
    Path(path).write_text(content, encoding="utf-8")
    return content
26
+
27
+
28
+ def _serialize(doc: Document) -> str:
29
+ """Serialize Document to txtdown string."""
30
+ parts: list[str] = []
31
+
32
+ # Front matter
33
+ meta_dict = doc.metadata.to_dict()
34
+ if meta_dict:
35
+ parts.append("---")
36
+ # Use yaml.dump with default_flow_style=False for readable output
37
+ yaml_str = yaml.dump(meta_dict, default_flow_style=False, allow_unicode=True)
38
+ parts.append(yaml_str.rstrip())
39
+ parts.append("---")
40
+ parts.append("")
41
+
42
+ # Sections
43
+ for i, section in enumerate(doc.sections):
44
+ # Section separator (except before first section)
45
+ if i > 0:
46
+ parts.append("")
47
+ parts.append("---")
48
+
49
+ # Add explicit ID if section has non-numeric or non-sequential ID
50
+ expected_id = str(i + 1)
51
+ needs_header = section.id != expected_id or section.title
52
+ if needs_header:
53
+ # Build header: "--- id" or "--- id: title"
54
+ if section.title:
55
+ header = f"--- {section.id}: {section.title}"
56
+ else:
57
+ header = f"--- {section.id}"
58
+ # Rewrite the separator with ID/title
59
+ if i > 0:
60
+ parts[-1] = header
61
+ else:
62
+ # First section with explicit ID
63
+ parts.append(header)
64
+
65
+ # Section metadata (immediately after separator, no blank line)
66
+ if section.metadata:
67
+ for key, value in section.metadata.items():
68
+ if isinstance(value, bool):
69
+ value_str = "true" if value else "false"
70
+ else:
71
+ value_str = str(value)
72
+ parts.append(f"{key}: {value_str}")
73
+
74
+ # Blank line before content
75
+ parts.append("")
76
+
77
+ # Section content
78
+ auto_number = 0
79
+ for line in section.lines:
80
+ auto_number += 1
81
+
82
+ # Build text with speaker or quote markup if needed
83
+ if line.is_quote:
84
+ text = f"> {line.text}"
85
+ elif line.speaker:
86
+ text = f"@{line.speaker}: {line.text}"
87
+ else:
88
+ text = line.text
89
+
90
+ # Add leading prefix if number differs from auto-increment
91
+ if line.number != auto_number:
92
+ text = f"{line.number}. {text}"
93
+ auto_number = line.number
94
+
95
+ # Add trailing label if present
96
+ if line.label:
97
+ text = f"{text} {line.label}"
98
+
99
+ parts.append(text)
100
+
101
+ # Ensure trailing newline
102
+ content = "\n".join(parts)
103
+ if not content.endswith("\n"):
104
+ content += "\n"
105
+
106
+ return content
@@ -0,0 +1,214 @@
1
+ Metadata-Version: 2.4
2
+ Name: txtdown
3
+ Version: 0.2.0
4
+ Summary: Minimal markup for Latin text collections
5
+ Author: Patrick J. Burns
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/diyclassics/txtdown
8
+ Project-URL: Repository, https://github.com/diyclassics/txtdown
9
+ Project-URL: Changelog, https://github.com/diyclassics/txtdown/blob/main/CHANGELOG.md
10
+ Project-URL: Issues, https://github.com/diyclassics/txtdown/issues
11
+ Keywords: latin,markup,text,philology,digital-humanities,nlp
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Text Processing :: Markup
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: pyyaml>=6.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest>=7.0; extra == "dev"
26
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
27
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
28
+ Dynamic: license-file
29
+
30
+ # txtdown
31
+
32
+ Minimal markup for Latin text collections using human-readable markup with inferrable hierarchical structure for scholarly citation.
33
+
34
+ ## Installation
35
+
36
+ ```bash
37
+ pip install git+https://github.com/diyclassics/txtdown.git
38
+ ```
39
+
40
+ ## Quick Start
41
+
42
+ ```python
43
+ from txtdown import parse, write
44
+
45
+ # Parse a .txtd file
46
+ doc = parse("sulpicia.txtd")
47
+
48
+ # Access metadata
49
+ print(doc.metadata.author) # "Sulpicia"
50
+ print(doc.metadata.work) # "Epistulae"
51
+
52
+ # Access by citation
53
+ line = doc.get("2.3") # Section 2, line 3
54
+ section = doc.get("1") # Entire section 1
55
+
56
+ # Iterate sections and lines
57
+ for section in doc.sections:
58
+ for line in section.lines:
59
+ print(f"{section.id}.{line.number}: {line.text}")
60
+
61
+ # Write back to file (round-trip safe)
62
+ write(doc, "output.txtd")
63
+ ```
64
+
65
+ ## Format Specification
66
+
67
+ A `.txtd` file consists of a YAML front matter block followed by sections separated by horizontal rules (`---`). The front matter block is required and must include a `work` field; `parse()` raises `ValueError` otherwise. To parse a fragment without metadata (e.g. a single line or section), pass `strict=False`.
68
+
69
+ ### Basic Structure
70
+
71
+ ```
72
+ ---
73
+ author: Sulpicia
74
+ work: Epistulae
75
+ source: https://thelatinlibrary.com/sulpicia.html
76
+ ---
77
+
78
+ --- 1
79
+
80
+ Tandem venit amor, qualem texisse pudori
81
+ quam nudasse alicui sit mihi fama magis.
82
+ exorata meis illum Cytherea Camenis
83
+ attulit in nostrum deposuitque sinum.
84
+ etc.
85
+
86
+ --- 2
87
+
88
+ Invisus natalis adest, qui rure molesto
89
+ et sine Cerintho tristis agendus erit.
90
+ etc.
91
+ ```
92
+
93
+ ### Sections
94
+
95
+ - Sections are separated by `---` (three or more hyphens)
96
+ - Sections auto-number (1, 2, 3...) unless given explicit IDs (best practice)
97
+ - Explicit section ID: `--- prooemium` or `--- 1a`
98
+ - Section with title: `--- prooemium: Introduction`
99
+
100
+ ### Lines (for verse)
101
+
102
+ - Lines auto-number within each section (1, 2, 3...)
103
+ - Blank lines don't count toward line numbering
104
+ - Access via citation: `doc.get("2.3")` returns section 2, line 3
105
+
106
+ **Line indentation** (`mode: verse`): Leading whitespace indicates poetic structure (e.g., pentameter lines in elegiac couplets):
107
+
108
+ ```
109
+ Tandem venit amor, qualem texisse pudori
110
+     quam nudasse alicui sit mihi fama magis.
111
+ ```
112
+
113
+ The parser preserves indentation. For NLP, TxtdownReader strips leading whitespace when joining lines for sentence segmentation.
114
+
115
+ ### Speaker Markup (dramatic texts)
116
+
117
+ For dramatic texts, use `@Speaker:` at the start of a line to mark speaker attribution:
118
+
119
+ ```
120
+ @Diocletianus: Quid sibi vult ista, quae vos agitat, fatuitas?
121
+ @Agapes: quod signum fatuitatis nobis inesse deprehendis?
122
+ @Diocletianus: Evidens magnumque.
123
+ ```
124
+
125
+ The parser extracts the speaker name into `line.speaker` and keeps `line.text` as pure speech text — ideal for NLP pipelines that need clean text without markup.
126
+
127
+ ```python
128
+ doc = parse("dulcitius.txtd")
129
+ for line in doc.sections[0].lines:
130
+ print(f"{line.speaker}: {line.text}")
131
+ # Diocletianus: Quid sibi vult ista...
132
+ ```
133
+
134
+ Non-speaker lines (stage directions, prose) have `line.speaker = None`. Speaker markup round-trips through `write()`.
135
+
136
+ ### Cross-source Quotation
137
+
138
+ Use `>` at the start of a line to mark text quoted verbatim from *another* literary
139
+ source — an author embedding a poet's verse in their own prose, for example. This
140
+ repurposes the familiar blockquote convention for the citational habits of classical texts:
141
+
142
+ ```
143
+ Quamquam Ennius recte:
144
+
145
+ > Amicus certus in re incerta cernitur,
146
+
147
+ tamen haec duo levitatis et infirmitatis plerosque convincunt.
148
+ ```
149
+
150
+ The parser strips the `>` marker and flags the line with `line.is_quote = True`, keeping
151
+ `line.text` as clean quoted text. Consecutive `>` lines form a multi-line quotation:
152
+
153
+ ```
154
+ > Negat quis, nego; ait, aio; postremo imperavi egomet mihi
155
+ > Omnia adsentari,
156
+ ```
157
+
158
+ ```python
159
+ doc = parse("cicero-de-amicitia.txtd")
160
+ quotes = [line.text for s in doc.sections for line in s.lines if line.is_quote]
161
+ # ['Amicus certus in re incerta cernitur,', ...]
162
+ ```
163
+
164
+ Non-quote lines have `line.is_quote = False`. Quotation markup round-trips through `write()`.
165
+ See `examples/cicero-de-amicitia.txtd` (Cicero quoting Ennius and Terence) and
166
+ `examples/augustine-civ-dei-1.2.txtd` (Augustine quoting Virgil).
167
+
168
+ ### Metadata
169
+
170
+ | Field | Description |
171
+ |-------|-------------|
172
+ | `work` | Work title (**required**) |
173
+ | `author` | Author name |
174
+ | `source` | Source URL or reference |
175
+ | `scope` | Portion of work in file (e.g., `1-6` for books 1-6) |
176
+
177
+ Additional fields are preserved in `metadata.extras`.
178
+
179
+ ## API Reference
180
+
181
+ ### Functions
182
+
183
+ - `parse(source: str | Path, *, strict: bool = True) -> Document` — Parse a `.txtd` file (given as a path) or a string of txtdown content. Strict by default: raises `ValueError` if the front matter block or `work` field is missing; pass `strict=False` for fragments.
184
+ - `write(doc: Document, path: str | Path | None = None) -> str` — Writes to a file if `path` is given; always returns the serialized string
185
+
186
+ ### Classes
187
+
188
+ - `Document` — Container with `metadata: Metadata` and `sections: list[Section]`
189
+ - `Section` — Container with `id: str`, `lines: list[Line]`, optional `title` and `metadata`
190
+ - `Line` — Container with `text: str`, `number: int`, optional `speaker: str | None` and `label: str | None`, and `is_quote: bool` (cross-source quotation)
191
+ - `Metadata` — Container with `author`, `work`, `source`, `scope`, and `extras` dict
192
+
193
+ ## Development
194
+
195
+ ```bash
196
+ # Clone and install dev dependencies
197
+ git clone https://github.com/diyclassics/txtdown.git
198
+ cd txtdown
199
+ pip install -e ".[dev]"
200
+
201
+ # Run tests
202
+ pytest tests/ -v
203
+
204
+ # Run with coverage
205
+ pytest tests/ --cov=txtdown --cov-report=term-missing
206
+ ```
207
+
208
+ ## Project History
209
+
210
+ The idea for txtdown originated in January 2018, inspired by the need for a document format for Latin text collections that balanced the simplicity of plaintext with the more involved markup of XML-based formats like TEI. The goal was to create a format that is both human-readable and computer-tractable, supporting hierarchical structures, fundamental annotations, and embedded metadata. Txtdown has since been influenced by ongoing work on annotation projects such as the [Representing Women Authorship in the Latin Treebanks (RWALT)](https://diyclassics.github.io/rwalt-site/) project.
211
+
212
+ ## License
213
+
214
+ MIT
@@ -0,0 +1,9 @@
1
+ txtdown/__init__.py,sha256=Y8g1ClbPS1UFgrV612MATjF_WheoKSjw4tVX1u2apJI,628
2
+ txtdown/models.py,sha256=IiroQFzLjEQ5XCdNBS34ECfPPrZevYKfOVZ_XQlIXJs,5552
3
+ txtdown/parser.py,sha256=SFc7tKC73-6VN0Y6E1qWuAJcXS2iBlQAf2Dz0OUF3RI,12132
4
+ txtdown/writer.py,sha256=6RaFTWSa8i7DdDbCkM8jYf7cLChrxHNNdBES2ZlOqHw,3124
5
+ txtdown-0.2.0.dist-info/licenses/LICENSE,sha256=Fh2wAEotBNBqY258xRJU-fCwlxsVQwvxpT2V1uPKwfs,1078
6
+ txtdown-0.2.0.dist-info/METADATA,sha256=gBqwqVYHY7SVyV2gYYUwbqJJCRSSBjUdZgemvLbTHYI,7319
7
+ txtdown-0.2.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ txtdown-0.2.0.dist-info/top_level.txt,sha256=e7Jb95B7fl_8SC-WvP20XR5iQnNGgFy1_AZ9LkbHwBA,8
9
+ txtdown-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2018-2026 Patrick J. Burns
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ txtdown