gedcom-x 0.5.1__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gedcomx/Gedcom5x.py ADDED
@@ -0,0 +1,558 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import html
5
+ import os
6
+ from typing import List, Optional, Tuple
7
+ import re
8
+ from collections import defaultdict
9
+ from typing import Iterable, Iterator, List, Optional, Tuple, Union
10
+
11
# Unicode byte-order mark; the file reader strips it from the start of a line.
BOM = '\ufeff'

# One GEDCOM line: numeric level, optional "@xref@" record identifier,
# tag, and optional value (everything after the tag).  Compiled in VERBOSE
# mode, so whitespace and "#" comments inside the pattern are ignored.
GEDCOM7_LINE_RE = re.compile(
r"""^
(?P<level>\d+) # Level
(?:\s+@(?P<xref>[^@]+)@)? # Optional record identifier
\s+(?P<tag>[A-Z0-9_-]+) # Tag
(?:\s+(?P<value>.+))? # Optional value (may be XREF)
$""",
    re.VERBOSE
)

# A value that consists solely of a pointer, e.g. "@I1@".
XREF_RE = re.compile(r'^@[^@]+@$')

# TODO: add a hash table for the XREFs of level-zero records?

# ---------------------------------------------------------------------------
# Regex source fragments, composed bottom-up into `line`.
# NOTE(review): these appear to follow the GEDCOM 7 line grammar - confirm
# against the specification.  None of them is compiled or referenced by the
# code visible in this file; the parser uses GEDCOM7_LINE_RE instead.
# ---------------------------------------------------------------------------
nonzero = '[1-9]'
# Level is "0" or a number without a leading zero.
level = f'(?P<level>0|{nonzero}[0-9]*)'
atsign = '@'
underscore = '_'
ucletter = '[A-Z]'
# A tag character: uppercase letter, digit or underscore.
tagchar = f'({ucletter}|[0-9]|{underscore})'
# Cross-reference identifier: one or more tag characters between "@" signs.
xref = f'{atsign}({tagchar})+{atsign}'
# Delimiter between line parts: a single (escaped) space.
d = '\\ '
# Standard tags start with an uppercase letter; extension tags with "_".
stdtag = f'{ucletter}({tagchar})*'
exttag = f'{underscore}({tagchar})+'
tag = f'({stdtag}|{exttag})'
voidptr = '@VOID@'
pointer = f'(?P<pointer>{voidptr}|{xref})'
# Tab, or any character from space upward except "@" (U+0040).
nonat = '[\t -?A-\\U0010ffff]'
# Tab, or any character from space upward (so no CR/LF).
noneol = '[\t -\\U0010ffff]'
# A line string starts with a non-"@" character or a doubled "@@".
linestr = f'(?P<linestr>({nonat}|{atsign}{atsign})({noneol})*)'
lineval = f'({pointer}|{linestr})'
# End of line: CR optionally followed by LF, or a bare LF.
eol = '(\\\r(\\\n)?|\\\n)'
# Full line: level, optional xref, tag, optional value, end of line.
line = f'{level}{d}((?P<xref>{xref}){d})?(?P<tag>{tag})({d}{lineval})?{eol}'
46
+
47
+ from typing import List, Optional, Iterator, Union
48
+
49
+
50
class GedcomRecord():
    """
    One GEDCOM line together with the records nested directly beneath it.

    Parameters
    ----------
    line_num : Optional[int]
        1-based line number of the record in its source file, if known.
    level : int
        GEDCOM level of the line (coerced with ``int``).
    tag : str
        GEDCOM tag; surrounding whitespace is stripped.
    xref : Optional[str]
        Cross-reference identifier associated with the line, if any.
    value : Optional[str]
        Line value, if any.

    Notes
    -----
    A record behaves like a container of its direct subrecords:
    ``len(rec)``, ``rec[0]``, ``rec[1:3]``, ``rec['NAME']`` and
    ``'NAME' in rec`` all look at direct children only, while
    ``iter(rec)`` walks the record and *all* descendants depth-first.
    """

    def __init__(
        self,
        line_num: Optional[int] = None,
        level: int = -1,
        tag: str = "NONR",
        xref: Optional[str] = None,
        value: Optional[str] = None,
    ) -> None:
        self.line = line_num
        self._subRecords: List[GedcomRecord] = []
        self.level = int(level)
        self.xref = xref
        self.pointer: bool = False
        self.tag = str(tag).strip()
        self.value = value

        self.parent: Optional[GedcomRecord] = None
        self.root: Optional[GedcomRecord] = None

    # ───────────────────────────────
    # Dict/JSON friendly view
    # ───────────────────────────────
    @property
    def _as_dict_(self):
        """Nested plain-dict view of this record and all its subrecords."""
        return {
            "level": self.level,
            "xref": self.xref,
            "tag": self.tag,
            "pointer": self.pointer,
            "value": self.value,
            "subrecords": [sub._as_dict_ for sub in self._subRecords],
        }

    # ───────────────────────────────
    # Subrecord management
    # ───────────────────────────────
    def addSubRecord(self, record: "GedcomRecord"):
        """
        Attach ``record`` as a direct child and set its ``parent``.

        Raises
        ------
        ValueError
            If ``record`` is None or its level is not exactly one deeper
            than this record's level.
        """
        # Check None first: the level message below dereferences the record,
        # which would otherwise surface as an AttributeError.
        if record is None:
            raise ValueError(
                f"SubRecord must be next level from this record (level:{self.level}, subRecord is None)"
            )
        if record.level != self.level + 1:
            raise ValueError(
                f"SubRecord must be next level from this record (level:{self.level}, subRecord has level {record.level})"
            )
        record.parent = self
        self._subRecords.append(record)

    def recordOnly(self):
        """Return a copy of this record's own fields, without subrecords or parent/root links."""
        copy = GedcomRecord(
            line_num=self.line,
            level=self.level,
            tag=self.tag,
            xref=self.xref,
            value=self.value,
        )
        copy.pointer = self.pointer
        return copy

    # ───────────────────────────────
    # Pretty printers
    # ───────────────────────────────
    def dump(self) -> str:
        """Return a tab-indented, one-line-per-record listing of this subtree."""
        return "".join(self._dump_lines(0))

    def _dump_lines(self, depth: int) -> Iterator[str]:
        """Yield the dump lines of this subtree, indented ``depth`` tabs deeper per nesting level."""
        yield (
            "\t" * depth
            + f"Level: {self.level}, tag: {self.tag}, value: {self.value}, "
            + f"subRecords: {len(self._subRecords)}\n"
        )
        for sub in self._subRecords:
            yield from sub._dump_lines(depth + 1)

    def describe(self, subRecords: bool = False) -> str:
        """
        Return a one-line description of this record.

        When ``subRecords`` is true, the descriptions of all descendants
        follow, one per line.
        """
        level_str = "\t" * self.level
        description = (
            f"Line {self.line}: {level_str} Level: {self.level}, "
            f"tag: '{self.tag}', xref={self.xref} value: '{self.value}', "
            f"subRecords: {len(self._subRecords)}"
        )
        if subRecords:
            for sub in self._subRecords:
                description += "\n" + sub.describe(subRecords=True)
        return description

    # ───────────────────────────────
    # Subrecord access
    # ───────────────────────────────
    def subRecord(self, tag: str):
        """Return the list of direct subrecords with ``tag``, or None if there are none."""
        result = [r for r in self._subRecords if r.tag == tag]
        return result or None

    def subRecords(self, tag: Optional[str] = None):
        """
        Return subrecords selected by a ``/``-separated tag path.

        - no ``tag``: the (live) list of direct subrecords
        - ``'NAME'``: list of direct subrecords tagged NAME
        - ``'NAME/GIVN'``: list of GIVN records below any direct NAME record

        Returns None when a tag is given and nothing matches.
        """
        if not tag:
            return self._subRecords

        head, _, rest = tag.partition("/")
        matches = [r for r in self._subRecords if r.tag == head]
        if not matches:
            return None
        if not rest:
            return matches

        # Descend into every first-level match; each call yields a list or None.
        results = []
        for match in matches:
            results.extend(match.subRecords(rest) or [])
        return results or None

    # ───────────────────────────────
    # Iteration / Subscriptability
    # ───────────────────────────────
    def __call__(self) -> str:
        return self.describe()

    def __iter__(self) -> Iterator["GedcomRecord"]:
        """Iterates recursively over self and all subrecords."""
        yield from self._flatten_subrecords(self)

    def _flatten_subrecords(self, record: "GedcomRecord") -> Iterator["GedcomRecord"]:
        """Yield ``record`` followed by all of its descendants, depth-first."""
        yield record
        for sub in record._subRecords:
            yield from self._flatten_subrecords(sub)

    def __len__(self) -> int:
        """Number of direct subrecords."""
        return len(self._subRecords)

    def __getitem__(self, key: Union[int, slice, str]) -> Union["GedcomRecord", List["GedcomRecord"]]:
        """
        - rec[0]      -> first subrecord
        - rec[1:3]    -> slice of subrecords
        - rec['NAME'] -> the subrecord tagged 'NAME' if there is exactly one,
                         otherwise the list of all of them

        Raises KeyError when no subrecord has the tag and TypeError for
        unsupported key types.
        """
        if isinstance(key, (int, slice)):
            return self._subRecords[key]
        if isinstance(key, str):
            matches = [r for r in self._subRecords if r.tag == key]
            if not matches:
                raise KeyError(f"No subrecords with tag '{key}'.")
            return matches[0] if len(matches) == 1 else matches
        raise TypeError(f"Unsupported key type: {type(key).__name__}")

    def __contains__(self, key: object) -> bool:
        """True for a tag carried by a direct subrecord, or a valid non-negative index."""
        if isinstance(key, str):
            return any(r.tag == key for r in self._subRecords)
        if isinstance(key, int):
            return 0 <= key < len(self._subRecords)
        return False
196
+
197
+
198
# Key types accepted when subscripting: a tag name, a positional index,
# or a slice of positions.
TagKey = str
IndexKey = int
Key = Union[IndexKey, slice, TagKey]
201
+
202
class Gedcom5x():
    """
    Object representing a Genealogy in legacy GEDCOM 5.x / 7 format.

    Parameters
    ----------
    records : List[GedcomRecord]
        List of GedcomRecords to initialize the genealogy with
    filepath : str
        Path to a GEDCOM (``*``.ged) file; if provided, the file is read,
        parsed and the object is initialized with the records in the file.

    Note
    ----
    **filepath** takes precedence over **records**.
    If no arguments are provided, the object initializes with no records.

    The categorized lists (``individuals``, ``families``, ...) are built once
    in ``__init__``; ``append``/``extend``/``insert``/``remove``/``clear``
    maintain only ``records`` and the tag index.
    """
    _top_level_tags = ['INDI', 'FAM', 'OBJE', 'SOUR', 'REPO', 'NOTE', 'HEAD', 'SNOTE']

    def __init__(self, records: Optional[List["GedcomRecord"]] = None, filepath: Optional[str] = None) -> None:
        loaded = self._records_from_file(filepath) if filepath else records
        # Always define self.records: no arguments, an empty list, or a file
        # without records (for which the reader returns None) all mean "empty".
        self.records: List[GedcomRecord] = loaded if loaded is not None else []

        # Fast tag index: {'HEAD': [rec], 'INDI': [rec1, rec2, ...], ...}
        self._tag_index: dict[str, List[GedcomRecord]] = defaultdict(list)
        self._reindex()

        self.header: Optional[GedcomRecord] = None
        self._sources: List[GedcomRecord] = []
        self._repositories: List[GedcomRecord] = []
        self._individuals: List[GedcomRecord] = []
        self._families: List[GedcomRecord] = []
        self._objects: List[GedcomRecord] = []
        self._snotes: List[GedcomRecord] = []
        self.version = None

        # Tags that are only categorized when they sit at level 0.
        level_zero_buckets = {
            'SOUR': self._sources,
            'REPO': self._repositories,
            'FAM': self._families,
            'OBJE': self._objects,
            'SNOTE': self._snotes,
        }

        for record in self.records:
            if record.tag == 'HEAD':
                self.header = record
                self.version = self._header_version(record)
            elif record.tag == 'INDI':
                self._individuals.append(record)
            elif record.tag in level_zero_buckets and record.level == 0:
                # Fall back to the value as identifier only when no xref was
                # parsed; never overwrite a real xref with the note text.
                if record.tag == 'SNOTE' and record.xref is None:
                    record.xref = record.value
                level_zero_buckets[record.tag].append(record)

    @staticmethod
    def _header_version(header: "GedcomRecord") -> Optional[str]:
        """Return the value of the first HEAD/GEDC/VERS record, or None if absent."""
        matches = header.subRecords('GEDC/VERS')
        return matches[0].value if matches else None

    # ─────────────────────────────────────────────────────────────
    # Subscriptable & iterable behavior
    # ─────────────────────────────────────────────────────────────
    def _reindex(self) -> None:
        """Rebuild the tag index from self.records."""
        self._tag_index.clear()
        for rec in self.records:
            # Normalize tag just in case
            tag = rec.tag if isinstance(rec.tag, str) else str(rec.tag)
            self._tag_index[tag].append(rec)

    def __len__(self) -> int:
        return len(self.records)

    def __iter__(self) -> Iterator['GedcomRecord']:
        # Enables: for x in gedcom:
        return iter(self.records)

    def __contains__(self, key: object) -> bool:
        # Enables: 'HEAD' in gedcom (tag membership)
        if isinstance(key, str):
            # .get() avoids creating empty buckets in the defaultdict.
            return bool(self._tag_index.get(key))
        if isinstance(key, int):
            return 0 <= key < len(self.records)
        return False

    def __getitem__(self, key: Union[int, slice, str]) -> Union['GedcomRecord', List['GedcomRecord']]:
        """
        - gedcom[0]        -> GedcomRecord at index 0
        - gedcom[1:5]      -> list of GedcomRecord (slice)
        - gedcom['HEAD']   -> single record if exactly one; otherwise list of matching records
        - gedcom['INDI']   -> list of all INDI records (usually many)

        Raises KeyError when no record has the tag and TypeError for
        unsupported key types.
        """
        if isinstance(key, (int, slice)):
            return self.records[key]
        if isinstance(key, str):
            matches = self._tag_index.get(key, [])
            if not matches:
                raise KeyError(f"No records with tag '{key}'.")
            # If exactly one match (e.g., HEAD), return the record; otherwise return list
            return matches[0] if len(matches) == 1 else matches
        raise TypeError(f"Unsupported key type: {type(key).__name__}")

    # Optional: convenience helpers
    def by_tag(self, tag: str) -> List['GedcomRecord']:
        """Always return a list of records for a tag (empty list if none)."""
        return list(self._tag_index.get(tag, []))

    def first(self, tag: str) -> Optional['GedcomRecord']:
        """Return the first record with a given tag, or None."""
        lst = self._tag_index.get(tag, [])
        return lst[0] if lst else None

    # If you add/replace records after init, keep the index fresh:
    def append(self, rec: 'GedcomRecord') -> None:
        """Append a record and index it by tag."""
        self.records.append(rec)
        self._tag_index[rec.tag].append(rec)

    def extend(self, recs: Iterable['GedcomRecord']) -> None:
        """Append every record in ``recs`` and index each by tag."""
        # Materialize first: a generator would be exhausted by the first pass
        # and nothing would reach the index.
        new_records = list(recs)
        self.records.extend(new_records)
        for rec in new_records:
            self._tag_index[rec.tag].append(rec)

    def insert(self, idx: int, rec: 'GedcomRecord') -> None:
        """Insert a record at ``idx``."""
        self.records.insert(idx, rec)
        # Rebuild so bucket order keeps mirroring the order of self.records
        # (first()/by_tag() depend on it).
        self._reindex()

    def remove(self, rec: 'GedcomRecord') -> None:
        """Remove a record; raises ValueError if it is not in ``records``."""
        self.records.remove(rec)
        bucket = self._tag_index.get(rec.tag)
        if not bucket:
            return
        try:
            bucket.remove(rec)
        except ValueError:
            return  # already out of index
        if not bucket:
            del self._tag_index[rec.tag]

    def clear(self) -> None:
        """Drop all records and the tag index."""
        self.records.clear()
        self._tag_index.clear()

    # =========================================================
    # 2. PROPERTY ACCESSORS (GETTERS & SETTERS)
    # =========================================================

    @property
    def json(self):
        """JSON text with the nested dict view of every individual."""
        import json
        return json.dumps({'Individuals': [indi._as_dict_ for indi in self._individuals]}, indent=4)

    def stats(self):
        """Print a small table with the number of records per category."""
        def print_table(pairs):
            # Calculate the width of the columns
            name_width = max(len(name) for name, _ in pairs)
            value_width = max(len(str(value)) for _, value in pairs)

            # Print the header
            print('GEDCOM Import Results')
            header = f"{'Type'.ljust(name_width)} | {'Count'.ljust(value_width)}"
            print('-' * len(header))
            print(header)
            print('-' * len(header))

            # Print each pair in the table
            for name, value in pairs:
                print(f"{name.ljust(name_width)} | {str(value).ljust(value_width)}")

        imports_stats = [
            ('Top Level Records', len(self.records)),
            ('Individuals', len(self.individuals)),
            ('Family Group Records', len(self.families)),
            ('Repositories', len(self.repositories)),
            ('Sources', len(self.sources)),
            ('Objects', len(self.objects)),
        ]

        print_table(imports_stats)

    @staticmethod
    def _validated_record_list(name: str, value):
        """Return ``value`` if it is a list of GedcomRecord objects, else raise ValueError."""
        if not isinstance(value, list) or not all(isinstance(item, GedcomRecord) for item in value):
            raise ValueError(f"{name} must be a list of GedcomRecord objects.")
        return value

    @property
    def sources(self) -> List["GedcomRecord"]:
        """List of level-0 **SOUR** records found in the Genealogy"""
        return self._sources

    @sources.setter
    def sources(self, value: List["GedcomRecord"]):
        self._sources = self._validated_record_list("sources", value)

    @property
    def repositories(self) -> List["GedcomRecord"]:
        """
        List of **REPO** records found in the Genealogy
        """
        return self._repositories

    @repositories.setter
    def repositories(self, value: List["GedcomRecord"]):
        self._repositories = self._validated_record_list("repositories", value)

    @property
    def individuals(self) -> List["GedcomRecord"]:
        """List of **INDI** records found in the Genealogy"""
        return self._individuals

    @individuals.setter
    def individuals(self, value: List["GedcomRecord"]):
        self._individuals = self._validated_record_list("individuals", value)

    @property
    def families(self) -> List["GedcomRecord"]:
        """List of level-0 **FAM** records found in the Genealogy"""
        return self._families

    @families.setter
    def families(self, value: List["GedcomRecord"]):
        self._families = self._validated_record_list("families", value)

    @property
    def objects(self) -> List["GedcomRecord"]:
        """List of level-0 **OBJE** records found in the Genealogy"""
        return self._objects

    @objects.setter
    def objects(self, value: List["GedcomRecord"]):
        self._objects = self._validated_record_list("objects", value)

    def write(self) -> bool:
        """
        Method placeholder for writing GEDCOM files.

        Raises
        ------
        NotImplementedError
            writing to legacy GEDCOM file is not currently implemented.
        """
        raise NotImplementedError("Writing of GEDCOM files is not implemented.")

    @staticmethod
    def _parse_gedcom7_line(line: str) -> Optional[Tuple[int, Optional[str], str, Optional[str], Optional[str]]]:
        """
        Parse a GEDCOM line into: level, xref_id (record), tag, value, xref_value (if value is an @X@)

        Returns:
            (level, xref_id, tag, value, xref_value), or None if the line
            does not match GEDCOM7_LINE_RE.
        """
        match = GEDCOM7_LINE_RE.match(line.strip())
        if not match:
            return None

        value = match.group("value")
        # A literal "None" value is treated as "no value".
        # NOTE(review): presumably guards against exporters that stringified
        # a missing value - confirm this is still wanted.
        if value == 'None':
            value = None
        xref_value = value.strip("@") if value and XREF_RE.match(value.strip()) else None

        return int(match.group("level")), match.group("xref"), match.group("tag"), value, xref_value

    @staticmethod
    def _records_from_file(filepath: str) -> Optional[List["GedcomRecord"]]:
        """
        Read a GEDCOM (``*``.ged) file and build the tree of records.

        Returns
        -------
        Optional[List[GedcomRecord]]
            The level-0 records in file order, each carrying its nested
            subrecords; None when the file contains no level-0 record.

        Raises
        ------
        FileNotFoundError
            If ``filepath`` does not exist.
        ValueError
            If the file does not end in ``.ged``, a line is not a valid
            GEDCOM line, or a record has no open parent one level above it.
        """
        extension = '.ged'

        if not os.path.exists(filepath):
            print(f"File does not exist: {filepath}")
            raise FileNotFoundError(filepath)
        if not filepath.lower().endswith(extension.lower()):
            print(f"File does not have the correct extension: {filepath}")
            raise ValueError("File does not appear to be a GEDCOM")

        print("Reading from GEDCOM file")
        with open(filepath, 'r', encoding='utf-8') as file:
            lines = [line.strip() for line in file]

        records = []
        # level -> most recent record at that level inside the current
        # level-0 record; deeper entries are dropped whenever a shallower
        # line appears so a record can never attach to a stale parent.
        open_records = {}

        for index, line in enumerate(lines):
            line_no = index + 1
            if line.startswith(BOM):
                line = line.lstrip(BOM)
            line = html.unescape(line).replace('&quot;', '')

            if line.strip() == '':
                continue

            parsed = Gedcom5x._parse_gedcom7_line(line)
            if parsed is None:
                raise ValueError(f"Line {line_no}: not a valid GEDCOM line: {line!r}")
            level, xref, tag, value, xref_value = parsed

            # A pointer value doubles as the record's xref when the line has
            # no identifier of its own.
            if xref is None and xref_value is not None:
                xref = xref_value

            new_record = GedcomRecord(line_num=line_no, level=level, tag=tag, xref=xref, value=value)

            if level == 0:
                records.append(new_record)
            else:
                parent = open_records.get(level - 1)
                if parent is None:
                    raise ValueError(
                        f"Line {line_no}: level {level} record has no parent at level {level - 1}"
                    )
                new_record.root = open_records[0]
                parent.addSubRecord(new_record)  # also sets new_record.parent

            for stale in [lvl for lvl in open_records if lvl > level]:
                del open_records[stale]
            open_records[level] = new_record

        return records if records else None

    @staticmethod
    def fromFile(filepath: str) -> 'Gedcom5x':
        """
        Static method to create a Gedcom5x object from a GEDCOM file.

        Args:
            filepath (str): The path to the GEDCOM file.

        Returns:
            Gedcom5x: An instance of the Gedcom5x class.
        """
        records = Gedcom5x._records_from_file(filepath)
        return Gedcom5x(records=records)

    def merge_with_file(self, file_path: str) -> bool:
        """
        Placeholder for adding records from a valid (``*``.ged) file to the
        current Genealogy.

        NOTE: merging is not implemented yet; the file is not read and the
        method always returns True.

        Args:
            file_path (str): The path to the GEDCOM file.

        Returns:
            bool: Indicates if merge was successful (currently always True).
        """
        return True
558
+