gedcom-x 0.5.2__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gedcomx/Gedcom5x.py ADDED
@@ -0,0 +1,579 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import html
5
+ import os
6
+ from typing import List, Optional, Tuple
7
+ import re
8
+ from collections import defaultdict
9
+ from typing import Iterable, Iterator, List, Optional, Tuple, Union
10
+
11
+ from .LoggingHub import LoggingHub, ChannelConfig
12
# Module-level logging hub; created and initialised when the module is imported.
hub = LoggingHub("GEDCOM5x")
hub.init_root()

# Unicode byte-order mark that may prefix the first line of a file.
BOM = "\ufeff"

# Lenient line matcher actually used by the parser:
#   <level> [@xref@] <TAG> [value]
GEDCOM7_LINE_RE = re.compile(
    r"""^
    (?P<level>\d+) # Level
    (?:\s+@(?P<xref>[^@]+)@)? # Optional record identifier
    \s+(?P<tag>[A-Z0-9_-]+) # Tag
    (?:\s+(?P<value>.+))? # Optional value (may be XREF)
    $""",
    re.VERBOSE,
)

# Matches a value that consists solely of a pointer such as "@I1@".
XREF_RE = re.compile(r"^@[^@]+@$")

# Add hash table for XREF of Zero Records?

# ---------------------------------------------------------------------------
# Regex fragments composed bottom-up into a pattern for one complete line.
# Only the final string `line` is assembled here; nothing below is compiled.
# ---------------------------------------------------------------------------

# Level number: "0", or a non-zero digit followed by any digits.
nonzero = "[1-9]"
level = f"(?P<level>0|{nonzero}[0-9]*)"

# Character classes and single-character tokens.
atsign = "@"
underscore = "_"
ucletter = "[A-Z]"
tagchar = f"({ucletter}|[0-9]|{underscore})"

# Cross-reference identifier: tag characters wrapped in at-signs.
xref = f"{atsign}({tagchar})+{atsign}"

# Delimiter between line parts: an escaped single space.
d = r"\ "

# Tags: standard (leading uppercase letter) or extension (leading underscore).
stdtag = f"{ucletter}({tagchar})*"
exttag = f"{underscore}({tagchar})+"
tag = f"({stdtag}|{exttag})"

# Pointer values: the void pointer or any cross-reference identifier.
voidptr = "@VOID@"
pointer = f"(?P<pointer>{voidptr}|{xref})"

# Text values: first character is not "@" (or is a doubled "@@"),
# followed by any run of non end-of-line characters.
nonat = "[\t -?A-\\U0010ffff]"
noneol = "[\t -\\U0010ffff]"
linestr = f"(?P<linestr>({nonat}|{atsign}{atsign})({noneol})*)"
lineval = f"({pointer}|{linestr})"

# Line terminator: CR, CR LF or LF (each control character backslash-escaped).
eol = "(\\\r(\\\n)?|\\\n)"

# Full line: level, optional xref, tag, optional value, terminator.
line = f"{level}{d}((?P<xref>{xref}){d})?(?P<tag>{tag})({d}{lineval})?{eol}"
50
+
51
+ from typing import List, Optional, Iterator, Union
52
+
53
+
54
class GedcomRecord():
    """
    A single GEDCOM line together with the records nested directly beneath it.

    Parameters
    ----------
    line_num : Optional[int]
        Line number the record came from; stored as ``self.line``.
    level : int
        GEDCOM level number; coerced with ``int()``.
    tag : str
        GEDCOM tag; coerced to ``str`` and stripped of surrounding whitespace.
    xref : Optional[str]
        Cross-reference identifier attached to the record, if any.
    value : Optional[str]
        Line value, if any.

    Note
    ----
    The class defines ``__len__`` (number of direct subrecords) and no
    ``__bool__``, so a record without subrecords evaluates as falsy.
    Compare against ``None`` explicitly instead of relying on truthiness.
    """

    def __init__(
        self,
        line_num: Optional[int] = None,
        level: int = -1,
        tag: str = "NONR",
        xref: Optional[str] = None,
        value: Optional[str] = None,
    ) -> None:
        self.line = line_num
        self._subRecords: List["GedcomRecord"] = []
        self.level = int(level)
        self.xref = xref
        self.pointer: bool = False
        self.tag = str(tag).strip()
        self.value = value

        self.parent: Optional["GedcomRecord"] = None
        self.root: Optional["GedcomRecord"] = None

    # ───────────────────────────────
    # Dict/JSON friendly view
    # ───────────────────────────────
    @property
    def _as_dict_(self):
        """Return this record and, recursively, its subrecords as plain dicts."""
        return {
            "level": self.level,
            "xref": self.xref,
            "tag": self.tag,
            "pointer": self.pointer,
            "value": self.value,
            "subrecords": [sub._as_dict_ for sub in self._subRecords],
        }

    # ───────────────────────────────
    # Subrecord management
    # ───────────────────────────────
    def addSubRecord(self, record: "GedcomRecord"):
        """
        Attach ``record`` as a direct child and set its ``parent`` to ``self``.

        Raises
        ------
        ValueError
            If ``record`` is ``None`` or its level is not ``self.level + 1``.
        """
        # Checked separately: the level-mismatch message below reads
        # record.level, which would raise AttributeError on None.
        if record is None:
            raise ValueError(
                f"SubRecord must not be None (parent level:{self.level})"
            )
        if record.level != self.level + 1:
            raise ValueError(
                f"SubRecord must be next level from this record (level:{self.level}, subRecord has level {record.level})"
            )
        record.parent = self
        self._subRecords.append(record)

    def recordOnly(self):
        """
        Return a new record with this record's own fields and no subrecords.

        ``parent`` and ``root`` are not carried over to the copy.
        """
        clone = GedcomRecord(
            line_num=self.line,
            level=self.level,
            tag=self.tag,
            xref=self.xref,  # keep the identifier so the copy is still addressable
            value=self.value,
        )
        clone.pointer = self.pointer
        return clone

    # ───────────────────────────────
    # Pretty printers
    # ───────────────────────────────
    def dump(self) -> str:
        """Return a newline-terminated summary of this record and its subtree."""
        pieces = [
            f"Level: {self.level}, tag: {self.tag}, value: {self.value}, "
            f"subRecords: {len(self._subRecords)}\n"
        ]
        # Each child's dump is prefixed with exactly one tab.
        pieces.extend("\t" + sub.dump() for sub in self._subRecords)
        return "".join(pieces)

    def describe(self, subRecords: bool = False) -> str:
        """
        Return a one-line description of this record.

        When ``subRecords`` is true, the descriptions of all nested records
        follow, one per line.
        """
        level_str = "\t" * self.level
        description = (
            f"Line {self.line}: {level_str} Level: {self.level}, "
            f"tag: '{self.tag}', xref={self.xref} value: '{self.value}', "
            f"subRecords: {len(self._subRecords)}"
        )
        if subRecords:
            for child in self._subRecords:
                description += "\n" + child.describe(subRecords=True)
        return description

    # ───────────────────────────────
    # Subrecord access
    # ───────────────────────────────
    def subRecord(self, tag: str):
        """Return the list of direct subrecords tagged ``tag``, or ``None`` if none."""
        found = [r for r in self._subRecords if r.tag == tag]
        return found or None

    def subRecords(self, tag: Optional[str] = None):
        """
        Return subrecords, optionally selected by a ``/``-separated tag path.

        - No ``tag`` (``None`` or empty): the internal list of direct
          subrecords itself (not a copy).
        - ``"NAME"``: list of direct subrecords tagged ``NAME``.
        - ``"NAME/GIVN"``: list of ``GIVN`` records under any ``NAME`` subrecord.

        Returns ``None`` when a tag path is given and nothing matches.
        """
        if not tag:
            return self._subRecords

        head, _, rest = tag.partition("/")
        matches = [r for r in self._subRecords if r.tag == head]
        if not matches:
            return None
        if "/" not in tag:
            return matches

        # Descend into every first-level match with the remainder of the path.
        collected = []
        for match in matches:
            nested = match.subRecords(rest)
            if not nested:
                continue
            if isinstance(nested, list):
                collected.extend(nested)
            else:
                collected.append(nested)
        return collected or None

    # ───────────────────────────────
    # Iteration / Subscriptability
    # ───────────────────────────────
    def __call__(self) -> str:
        """Calling a record returns ``describe()``."""
        return self.describe()

    def __iter__(self) -> Iterator["GedcomRecord"]:
        """Iterates recursively over self and all subrecords."""
        yield from self._flatten_subrecords(self)

    def _flatten_subrecords(self, record: "GedcomRecord") -> Iterator["GedcomRecord"]:
        """Yield ``record`` and all of its descendants in pre-order."""
        # Explicit stack instead of recursion; children are pushed reversed so
        # they are popped (and yielded) in their stored order.
        pending = [record]
        while pending:
            current = pending.pop()
            yield current
            pending.extend(reversed(current._subRecords))

    def __len__(self) -> int:
        """Number of direct subrecords."""
        return len(self._subRecords)

    def __getitem__(self, key: Union[int, slice, str]) -> Union["GedcomRecord", List["GedcomRecord"]]:
        """
        - rec[0]      -> first subrecord
        - rec[1:3]    -> slice of subrecords
        - rec['NAME'] -> the subrecord tagged 'NAME' if exactly one exists,
                         otherwise the list of all such subrecords

        Raises ``KeyError`` when no subrecord carries the tag and
        ``TypeError`` for any other key type.
        """
        if isinstance(key, (int, slice)):
            return self._subRecords[key]
        if isinstance(key, str):
            matches = [r for r in self._subRecords if r.tag == key]
            if not matches:
                raise KeyError(f"No subrecords with tag '{key}'.")
            return matches[0] if len(matches) == 1 else matches
        raise TypeError(f"Unsupported key type: {type(key).__name__}")

    def __contains__(self, key: object) -> bool:
        """``'TAG' in rec`` tests direct subrecord tags; ``n in rec`` tests index range."""
        if isinstance(key, str):
            return any(r.tag == key for r in self._subRecords)
        if isinstance(key, int):
            return 0 <= key < len(self._subRecords)
        return False
200
+
201
+
202
# Key kinds accepted when subscripting a Gedcom5x object.
TagKey = str      # look up records by tag
IndexKey = int    # look up a record by position
Key = Union[IndexKey, slice, TagKey]
205
+
206
class Gedcom5x():
    """
    Object representing a Genealogy in legacy GEDCOM 5.x / 7 format.

    Parameters
    ----------
    records : List[GedcomRecord]
        List of GedcomRecords to initialize the genealogy with
    filepath : str
        path to a GEDCOM (``*``.ged), if provided object will read, parse and initialize with records in the file.

    Note
    ----
    **filepath** takes precedence over **records**.
    If no arguments are provided, Gedcom Object will initialize with no records.

    ``append``/``extend``/``insert``/``remove``/``clear`` maintain ``records``
    and the tag index only; the per-type lists (``individuals``, ``sources``,
    ...) are filled by the constructor and by ``load_file``.
    """
    _top_level_tags = ['INDI', 'FAM', 'OBJE', 'SOUR', 'REPO', 'NOTE', 'HEAD','SNOTE']

    def __init__(self, records: Optional[List["GedcomRecord"]] = None, filepath: Optional[str] = None) -> None:
        if filepath:
            # _records_from_file returns None for a file without records;
            # normalise so the rest of the object can always iterate.
            self.records: List["GedcomRecord"] = self._records_from_file(filepath) or []
        else:
            self.records = records or []

        # Fast tag index: {'HEAD': [rec], 'INDI': [rec1, rec2, ...], ...}
        self._tag_index: dict[str, List["GedcomRecord"]] = defaultdict(list)
        self._reindex()

        self.header: Optional["GedcomRecord"] = None
        self._sources: List["GedcomRecord"] = []
        self._repositories: List["GedcomRecord"] = []
        self._individuals: List["GedcomRecord"] = []
        self._families: List["GedcomRecord"] = []
        self._objects: List["GedcomRecord"] = []
        self._snotes: List["GedcomRecord"] = []
        self.version = None

        self._categorize(self.records, read_header=True)

    # ─────────────────────────────────────────────────────────────
    # Record classification
    # ─────────────────────────────────────────────────────────────
    @staticmethod
    def _header_version(header: "GedcomRecord") -> Optional[str]:
        """Return the value of HEAD/GEDC/VERS, or None when that path is absent."""
        node = header
        for tag in ('GEDC', 'VERS'):
            try:
                node = node[tag]
            except KeyError:
                return None
            # Subscripting by tag yields a list when the tag occurs more than once.
            if isinstance(node, list):
                node = node[0]
        return node.value

    def _categorize(self, records: Iterable["GedcomRecord"], read_header: bool) -> None:
        """
        Append each record to the per-type list matching its tag.

        INDI records are collected regardless of level; SOUR, REPO, FAM, OBJE
        and SNOTE only at level 0. When ``read_header`` is true a HEAD record
        also sets ``self.header`` and ``self.version``.
        """
        top_level_buckets = {
            'SOUR': self._sources,
            'REPO': self._repositories,
            'FAM': self._families,
            'OBJE': self._objects,
            'SNOTE': self._snotes,
        }
        for record in records:
            if record.tag == 'HEAD':
                if read_header:
                    self.header = record
                    self.version = self._header_version(record)
            elif record.tag == 'INDI':
                self._individuals.append(record)
            elif record.level == 0 and record.tag in top_level_buckets:
                # Fall back to the value as identifier only when no xref was
                # parsed, so a real identifier is never replaced by note text.
                if record.tag == 'SNOTE' and record.xref is None:
                    record.xref = record.value
                top_level_buckets[record.tag].append(record)

    # ─────────────────────────────────────────────────────────────
    # Subscriptable & iterable behavior
    # ─────────────────────────────────────────────────────────────
    def _reindex(self) -> None:
        """Rebuild the tag index from self.records."""
        self._tag_index.clear()
        for rec in self.records:
            # Normalize tag just in case
            tag = rec.tag if isinstance(rec.tag, str) else str(rec.tag)
            self._tag_index[tag].append(rec)

    def __len__(self) -> int:
        """Number of top-level records."""
        return len(self.records)

    def __iter__(self) -> Iterator['GedcomRecord']:
        # Enables: for x in gedcom:
        return iter(self.records)

    def __contains__(self, key: object) -> bool:
        # Enables: 'HEAD' in gedcom (tag membership)
        if isinstance(key, str):
            return key in self._tag_index and len(self._tag_index[key]) > 0
        if isinstance(key, int):
            return 0 <= key < len(self.records)
        return False

    def __getitem__(self, key: "Key") -> Union['GedcomRecord', List['GedcomRecord']]:
        """
        - gedcom[0]        -> GedcomRecord at index 0
        - gedcom[1:5]      -> list of GedcomRecord (slice)
        - gedcom['HEAD']   -> single record if exactly one; otherwise list of matching records
        - gedcom['INDI']   -> list of all INDI records (usually many)

        Raises ``KeyError`` when no record carries the tag and ``TypeError``
        for any other key type.
        """
        if isinstance(key, (int, slice)):
            return self.records[key]
        if isinstance(key, str):
            matches = self._tag_index.get(key, [])
            if not matches:
                raise KeyError(f"No records with tag '{key}'.")
            # If exactly one match (e.g., HEAD), return the record; otherwise return list
            return matches[0] if len(matches) == 1 else matches
        raise TypeError(f"Unsupported key type: {type(key).__name__}")

    # Optional: convenience helpers
    def by_tag(self, tag: str) -> List['GedcomRecord']:
        """Always return a list of records for a tag (empty list if none)."""
        return list(self._tag_index.get(tag, []))

    def first(self, tag: str) -> Optional['GedcomRecord']:
        """Return the first record with a given tag, or None."""
        lst = self._tag_index.get(tag, [])
        return lst[0] if lst else None

    # If you add/replace records after init, keep the index fresh:
    def append(self, rec: 'GedcomRecord') -> None:
        """Add ``rec`` at the end of ``records`` and index it by tag."""
        self.records.append(rec)
        self._tag_index[rec.tag].append(rec)

    def extend(self, recs: Iterable['GedcomRecord']) -> None:
        """Add every record in ``recs`` to ``records`` and index each by tag."""
        # Materialise once: ``recs`` may be a one-shot iterator, and it is
        # needed both for the record list and for the index.
        new_records = list(recs)
        self.records.extend(new_records)
        for rec in new_records:
            self._tag_index[rec.tag].append(rec)

    def insert(self, idx: int, rec: 'GedcomRecord') -> None:
        """Insert ``rec`` at position ``idx`` and index it by tag."""
        self.records.insert(idx, rec)
        # Rebuild only this tag's bucket so it mirrors the order of ``records``.
        self._tag_index[rec.tag] = [r for r in self.records if r.tag == rec.tag]

    def remove(self, rec: 'GedcomRecord') -> None:
        """
        Remove ``rec`` from ``records`` and from the tag index.

        Raises ``ValueError`` (from ``list.remove``) if ``rec`` is not in
        ``records``.
        """
        self.records.remove(rec)
        bucket = self._tag_index.get(rec.tag)
        if not bucket:
            return
        try:
            bucket.remove(rec)
        except ValueError:
            return  # already out of index
        if not bucket:
            del self._tag_index[rec.tag]

    def clear(self) -> None:
        """Drop all records and empty the tag index."""
        self.records.clear()
        self._tag_index.clear()

    # =========================================================
    # 2. PROPERTY ACCESSORS (GETTERS & SETTERS)
    # =========================================================

    @property
    def json(self):
        """JSON text (indent 4) with all individuals under the key ``Individuals``."""
        import json
        return json.dumps({'Individuals': [indi._as_dict_ for indi in self._individuals]},indent=4)

    def stats(self):
        """Print a two-column table of record counts to standard output."""
        def print_table(pairs):

            # Calculate the width of the columns
            name_width = max(len(name) for name, _ in pairs)
            value_width = max(len(str(value)) for _, value in pairs)

            # Print the header
            print(f'GEDCOM {self.version} Import Results')
            header = f"{'Type'.ljust(name_width)} | {'Count'.ljust(value_width)}"
            print('-' * len(header))
            print(header)
            print('-' * len(header))

            # Print each pair in the table
            for name, value in pairs:
                print(f"{name.ljust(name_width)} | {str(value).ljust(value_width)}")

        imports_stats = [
            ('Top Level Records', len(self.records)),
            ('Individuals', len(self.individuals)),
            ('Family Group Records', len(self.families)),
            ('Repositories', len(self.repositories)),
            ('Sources', len(self.sources)),
            ('Objects', len(self.objects))
        ]

        print_table(imports_stats)

    @property
    def sources(self) -> List["GedcomRecord"]:
        """
        List of **SOUR** records found in the Genealogy
        """
        return self._sources

    @sources.setter
    def sources(self, value: List["GedcomRecord"]):
        if not isinstance(value, list) or not all(isinstance(item, GedcomRecord) for item in value):
            raise ValueError("sources must be a list of GedcomRecord objects.")
        self._sources = value

    @property
    def repositories(self) -> List["GedcomRecord"]:
        """
        List of **REPO** records found in the Genealogy
        """
        return self._repositories

    @repositories.setter
    def repositories(self, value: List["GedcomRecord"]):
        if not isinstance(value, list) or not all(isinstance(item, GedcomRecord) for item in value):
            raise ValueError("repositories must be a list of GedcomRecord objects.")
        self._repositories = value

    @property
    def individuals(self) -> List["GedcomRecord"]:
        """
        List of **INDI** records found in the Genealogy
        """
        return self._individuals

    @individuals.setter
    def individuals(self, value: List["GedcomRecord"]):
        if not isinstance(value, list) or not all(isinstance(item, GedcomRecord) for item in value):
            raise ValueError("individuals must be a list of GedcomRecord objects.")
        self._individuals = value

    @property
    def families(self) -> List["GedcomRecord"]:
        """
        List of **FAM** records found in the Genealogy
        """
        return self._families

    @families.setter
    def families(self, value: List["GedcomRecord"]):
        if not isinstance(value, list) or not all(isinstance(item, GedcomRecord) for item in value):
            raise ValueError("families must be a list of GedcomRecord objects.")
        self._families = value

    @property
    def objects(self) -> List["GedcomRecord"]:
        """
        List of **OBJE** records found in the Genealogy
        """
        return self._objects

    @objects.setter
    def objects(self, value: List["GedcomRecord"]):
        if not isinstance(value, list) or not all(isinstance(item, GedcomRecord) for item in value):
            raise ValueError("objects must be a list of GedcomRecord objects.")
        self._objects = value

    def write(self) -> bool:
        """
        Method placeholder for writing GEDCOM files.

        Raises
        ------
        NotImplementedError
            writing to legacy GEDCOM file is not currently implemented.
        """
        raise NotImplementedError("Writing of GEDCOM files is not implemented.")

    @staticmethod
    def _records_from_file(file_path: str) -> Optional[List["GedcomRecord"]]:
        """
        Read a ``.ged`` file and build the tree of records it describes.

        Returns
        -------
        The list of level-0 records (each carrying its nested subrecords), or
        ``None`` when the file contains no records.

        Raises
        ------
        FileNotFoundError
            ``file_path`` does not exist.
        Exception
            ``file_path`` does not end in ``.ged`` (case-insensitive).
        ValueError
            A non-blank line does not match the GEDCOM line pattern, or a
            line's level has no open parent one level above it.
        """
        def parse_gedcom7_line(line: str) -> Optional[Tuple[int, Optional[str], str, Optional[str], Optional[str]]]:
            """
            Parse a GEDCOM 7 line into: level, xref_id (record), tag, value, xref_value (if value is an @X@)

            Returns:
                (level, xref_id, tag, value, xref_value), or None when the
                line does not match the pattern.
            """
            match = GEDCOM7_LINE_RE.match(line.strip())
            if not match:
                return None

            level = int(match.group("level"))
            xref_id = match.group("xref")
            tag = match.group("tag")
            value = match.group("value")
            if value == 'None': value = None
            xref_value = value.strip("@") if value and XREF_RE.match(value.strip()) else None

            return level, xref_id, tag, value, xref_value

        extension = '.ged'

        if not os.path.exists(file_path):
            print(f"File does not exist: {file_path}")
            raise FileNotFoundError(file_path)
        elif not file_path.lower().endswith(extension.lower()):
            print(f"File does not have the correct extension: {file_path}")
            raise Exception("File does not appear to be a GEDCOM")

        print("Reading from GEDCOM file")
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = [line.strip() for line in file]

        records = []
        # level -> most recent record seen at that level within the current subtree
        open_records = {}

        for line_num, line in enumerate(lines, start=1):
            if line.startswith(BOM):
                line = line.lstrip(BOM)
            line = html.unescape(line).replace('&quot;', '')

            if line.strip() == '':
                continue

            parsed = parse_gedcom7_line(line)
            if parsed is None:
                raise ValueError(f"Line {line_num}: not a valid GEDCOM line: {line!r}")
            level, xref, tag, value, xref_value = parsed

            # A line without its own identifier whose value is a pointer
            # records the pointed-to identifier as its xref.
            if xref is None and xref_value is not None:
                xref = xref_value

            new_record = GedcomRecord(line_num=line_num, level=level, tag=tag, xref=xref,value=value)

            if level == 0:
                records.append(new_record)
            else:
                parent = open_records.get(level - 1)
                if parent is None:
                    raise ValueError(
                        f"Line {line_num}: level {level} record has no parent at level {level - 1}"
                    )
                new_record.root = open_records.get(0)
                parent.addSubRecord(new_record)  # also sets new_record.parent

            # Records deeper than this one belong to a finished subtree; forget
            # them so a later line cannot attach to a stale parent.
            for stale in [lvl for lvl in open_records if lvl > level]:
                del open_records[stale]
            open_records[level] = new_record

        return records if records else None

    @staticmethod
    def fromFile(file_path: str) -> 'Gedcom5x':
        """
        Static method to create a Gedcom5x object from a GEDCOM file.

        Args:
            file_path (str): The path to the GEDCOM file.

        Returns:
            Gedcom5x: An instance of the Gedcom5x class.
        """
        records = Gedcom5x._records_from_file(file_path)

        gedcom = Gedcom5x(records=records)

        return gedcom

    def load_file(self,file_path: str) -> None:
        """
        Read ``file_path`` and add its records to this object.

        The new records are appended to ``records``, indexed by tag and sorted
        into the per-type lists. A HEAD record in the file does not change
        ``header`` or ``version``.

        Raises
        ------
        ValueError
            The file yielded no records.
        """
        records = Gedcom5x._records_from_file(file_path)
        if not records:
            raise ValueError(f"No GEDCOM records could be read from: {file_path}")
        # extend() keeps the tag index in step; classify only the new records
        # so records loaded earlier are not listed a second time.
        self.extend(records)
        self._categorize(records, read_header=False)
578
+
579
+