jodie-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jodie/__init__.py +5 -0
- jodie/cli/__doc__.py +61 -0
- jodie/cli/__init__.py +2 -0
- jodie/cli/__main__.py +298 -0
- jodie/cli/preview.py +65 -0
- jodie/config.py +122 -0
- jodie/constants.py +44 -0
- jodie/contact/__init__.py +5 -0
- jodie/contact/contact.py +659 -0
- jodie/input/__init__.py +5 -0
- jodie/input/clipboard.py +8 -0
- jodie/input/signature.py +57 -0
- jodie/input/stdin.py +7 -0
- jodie/parsers/__init__.py +23 -0
- jodie/parsers/base.py +69 -0
- jodie/parsers/parsers.py +243 -0
- jodie/parsers/pipeline.py +127 -0
- jodie-0.1.0.dist-info/METADATA +216 -0
- jodie-0.1.0.dist-info/RECORD +22 -0
- jodie-0.1.0.dist-info/WHEEL +5 -0
- jodie-0.1.0.dist-info/entry_points.txt +3 -0
- jodie-0.1.0.dist-info/top_level.txt +1 -0
jodie/input/signature.py
ADDED
@@ -0,0 +1,57 @@
import re
from typing import List

class SignaturePreprocessor:
    """Preprocess email signatures before parsing."""

    # Noise patterns to filter out
    NOISE_PATTERNS = [
        r'^>.*$',  # Quoted replies
        r'^On .* wrote:$',  # Forwarded headers
        r'^-+\s*Original Message\s*-+$',
        r'^From:.*$',
        r'^Sent:.*$',
        r'^To:.*$',
        r'^Subject:.*$',
    ]

    # Pronouns to strip from names
    PRONOUNS = r'\s*\([^)]*(?:he|she|they|him|her|them)[^)]*\)\s*'

    # Common separators
    SEPARATORS = r'[•·|–—]'

    @classmethod
    def preprocess(cls, text: str) -> List[str]:
        """Clean and split signature text into parseable tokens."""
        if not text:
            return []

        lines = []
        for line in text.strip().split('\n'):
            line = line.strip()

            # Skip empty lines
            if not line:
                continue

            # Skip noise patterns
            skip = False
            for pattern in cls.NOISE_PATTERNS:
                if re.match(pattern, line, re.IGNORECASE):
                    skip = True
                    break
            if skip:
                continue

            # Strip pronouns
            line = re.sub(cls.PRONOUNS, '', line, flags=re.IGNORECASE)

            # Split by common separators
            parts = re.split(cls.SEPARATORS, line)
            for part in parts:
                part = part.strip()
                if part:
                    lines.append(part)

        return lines
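Usage sketch (editor's illustration, not part of the published wheel; the sample signature text is invented, the calls use only the API shown above):

from jodie.input.signature import SignaturePreprocessor

raw = (
    "Jane Doe (she/her) | Senior Software Engineer\n"
    "Acme Corp • jane.doe@acme.example\n"
    "> On Mon, Jan 1, 2024, someone wrote:\n"
)
tokens = SignaturePreprocessor.preprocess(raw)
# The quoted-reply line is dropped, the pronoun parenthetical is stripped, and
# each remaining line is split on the separator characters, yielding:
# ['Jane Doe', 'Senior Software Engineer', 'Acme Corp', 'jane.doe@acme.example']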
jodie/parsers/__init__.py
ADDED
@@ -0,0 +1,23 @@
#!/usr/bin/env python3
# jodie/parsers/__init__.py
from jodie.parsers.parsers import (
    BaseParser,
    EmailParser,
    NameParser,
    WebsiteParser,
    TitleParser,
    PhoneParser
)
from jodie.parsers.base import ParseResult
from jodie.parsers.pipeline import ParserPipeline

__all__ = (
    "BaseParser",
    "EmailParser",
    "NameParser",
    "WebsiteParser",
    "TitleParser",
    "PhoneParser",
    "ParseResult",
    "ParserPipeline"
)
jodie/parsers/base.py
ADDED
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
# jodie/parsers/base.py
"""Base classes for parser plugin architecture."""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional, Any


@dataclass
class ParseResult:
    """Result from a parser."""
    value: Any
    confidence: float  # 0.0 to 1.0
    consumed_text: str
    source: str = "parsed"  # "parsed", "inferred", "explicit"

    def __repr__(self) -> str:
        """Developer-friendly representation showing value and confidence."""
        conf_pct = f"{self.confidence:.0%}"
        return f"ParseResult({self.value!r}, {conf_pct}, source={self.source!r})"

    def __str__(self) -> str:
        """User-friendly representation showing value with confidence indicator."""
        conf_pct = f"{self.confidence:.0%}"
        return f"{self.value} ({conf_pct})"


class BaseParser(ABC):
    """Abstract base class for all parsers.

    To create a new parser:
    1. Create a new file in jodie/parsers/
    2. Subclass BaseParser
    3. Set name, priority, and field_name class attributes
    4. Implement the parse() classmethod

    Parsers are auto-discovered by the Pipeline.
    """
    name: str = ""
    priority: int = 0  # Higher = runs first
    field_name: str = ""

    @classmethod
    @abstractmethod
    def parse(cls, text: str) -> Optional[ParseResult]:
        """Parse text and return result or None if no match.

        Args:
            text: Input text to parse

        Returns:
            ParseResult if match found, None otherwise
        """
        pass

    @staticmethod
    def find_matches(pattern: str, text: str) -> list:
        """Find all regex matches in text.

        Args:
            pattern: Regex pattern
            text: Text to search

        Returns:
            List of matches
        """
        import re
        return re.findall(pattern, text)
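A hypothetical subclass following the four steps listed in the BaseParser docstring (editor's sketch; LinkedInParser is not in the package and its regex and priority are illustrative):

from typing import Optional

from jodie.parsers.base import BaseParser, ParseResult


class LinkedInParser(BaseParser):
    # Not shipped with jodie; shown only to illustrate the plugin pattern.
    name = "linkedin"
    priority = 80            # higher = runs first (per BaseParser.priority)
    field_name = "linkedin"

    @classmethod
    def parse(cls, text: str) -> Optional[ParseResult]:
        matches = cls.find_matches(r'linkedin\.com/in/[A-Za-z0-9_-]+', text)
        if not matches:
            return None
        return ParseResult(value=matches[0], confidence=0.9, consumed_text=text)

Note that parsers.py (next file) ships its own, non-ABC BaseParser whose parse() returns raw values rather than ParseResult objects; ParserPipeline wraps those raw values in ParseResult itself, so the two base classes coexist.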
jodie/parsers/parsers.py
ADDED
@@ -0,0 +1,243 @@
#!/usr/bin/env python3
# jodie/parsers/parsers.py
import re
from nameparser import HumanName

class BaseParser:
    """
    Base class for parsing text into specific contact properties.
    """

    def __init__(self):
        pass

    @staticmethod
    def find_matches(pattern, text):
        """
        Find all matches of a regex pattern in the given text.

        :param pattern: Regex pattern to match against the text.
        :param text: The text to search in.
        :return: A list of all matches.
        """
        return re.findall(pattern, text)

    @classmethod
    def parse(cls, text):
        """
        Parse the given text to extract information.
        This method should be implemented by subclasses.

        :param text: The text to parse.
        :return: Extracted information.
        """
        raise NotImplementedError("Subclasses must implement this method.")


class EmailParser(BaseParser):
    @classmethod
    def parse(cls, text):
        """
        Extracts a single email from the text.
        Returns the first valid match or None.
        """
        email_pattern = r'<(\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)>' \
                        r'|(\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)'
        matches = cls.find_matches(email_pattern, text)
        for match in matches:
            # Return the first non-empty match
            if isinstance(match, tuple):
                return next((item for item in match if item), None)
            return match
        return None


class TitleParser(BaseParser):
    # Common job titles and their variations
    COMMON_TITLES = {
        # C-level
        "ceo", "cto", "cfo", "coo", "chief executive officer", "chief technology officer",
        "chief financial officer", "chief operating officer",

        # Founder/Leadership
        "founder", "co-founder", "cofounder", "president", "vice president", "vp",
        "director", "head", "lead", "principal",

        # Engineering
        "engineer", "developer", "architect", "scientist", "researcher",
        "software engineer", "senior engineer", "staff engineer", "full stack engineer",
        "senior full stack engineer", "senior software engineer",

        # Management
        "manager", "supervisor", "coordinator", "specialist", "analyst",
        "project manager", "product manager", "program manager",

        # Business
        "consultant", "advisor", "partner", "associate", "representative",
        "business development", "sales", "marketing",

        # Academic
        "professor", "lecturer", "researcher", "fellow", "scholar",

        # Creative
        "designer", "artist", "writer", "editor", "producer",

        # Other
        "intern", "assistant", "coordinator", "specialist"
    }

    # Acronyms that should stay uppercase
    ACRONYMS = {"CEO", "CTO", "CFO", "COO", "VP"}

    # Common title prefixes
    PREFIXES = {
        "senior", "lead", "principal", "staff", "associate", "junior",
        "full stack", "front end", "back end", "full-stack", "front-end", "back-end"
    }

    @classmethod
    def parse(cls, text):
        """
        Enhanced title parsing:
        - Standardizes capitalization for common titles
        - Handles variations and compound titles
        - Returns None for non-title text
        - Preserves acronym case (e.g., CEO stays as CEO)
        """
        if not isinstance(text, str) or not text.strip():
            return None

        # Normalize text for comparison
        cleaned_text = text.strip().lower()

        # Direct match against common titles
        if cleaned_text in cls.COMMON_TITLES:
            # Preserve original case for acronyms
            if text.upper() in cls.ACRONYMS:
                return text.upper()
            # Preserve original case for the entire title
            return text

        # Check for compound titles with prefixes
        words = cleaned_text.split()
        if len(words) >= 2:
            # First check if the entire phrase is a known title
            if cleaned_text in cls.COMMON_TITLES:
                return text

            # Then check for prefix + title combinations
            for i in range(len(words)):
                prefix = " ".join(words[:i+1])
                if prefix in cls.PREFIXES:
                    remaining = " ".join(words[i+1:])
                    if remaining in cls.COMMON_TITLES:
                        return text

            # Finally check if any part matches a known title
            for i in range(len(words)):
                for j in range(i + 1, len(words) + 1):
                    phrase = " ".join(words[i:j])
                    if phrase in cls.COMMON_TITLES:
                        return text

        return None


class WebsiteParser(BaseParser):
    @classmethod
    def parse(cls, text):
        # Match typical URL patterns
        url_pattern = r'https?://[^\s]+|www\.[^\s]+'
        urls = cls.find_matches(url_pattern, text)
        return urls[0] if urls else None


class PhoneParser(BaseParser):
    @classmethod
    def parse(cls, text):
        """
        Parse a phone number from the given text.
        Supports various phone number formats:
        - 10 digits: 5167763192
        - With country code: +15167763192, 15167763192
        - With separators: 516-776-3192, 516.776.3192, (516) 776-3192
        - International formats: +1 516 776 3192

        Returns the cleaned phone number or None if no valid phone found.
        """
        if not isinstance(text, str) or not text.strip():
            return None

        # Remove common non-digit characters for analysis
        digits_only = ''.join(ch for ch in text if ch.isdigit())

        # Various phone number patterns
        phone_patterns = [
            # US phone numbers with various formatting
            r'\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b',
            # 10 digit number
            r'\b([0-9]{10})\b',
            # International format with country code
            r'\+?([0-9]{1,3})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})\b'
        ]

        for pattern in phone_patterns:
            matches = cls.find_matches(pattern, text)
            if matches:
                # For tuple matches, join the groups
                if isinstance(matches[0], tuple):
                    # Remove empty groups and join
                    phone_digits = ''.join(group for group in matches[0] if group)
                else:
                    phone_digits = matches[0]

                # Validate phone number length
                if len(phone_digits) == 10:
                    # US phone number without country code
                    return phone_digits
                elif len(phone_digits) == 11 and phone_digits.startswith('1'):
                    # US phone number with country code
                    return phone_digits[1:]  # Return without country code
                elif 7 <= len(phone_digits) <= 15:
                    # International phone number
                    return phone_digits

        # Fallback: if we have exactly 10 or 11 digits, treat as phone
        if len(digits_only) == 10:
            return digits_only
        elif len(digits_only) == 11 and digits_only.startswith('1'):
            return digits_only[1:]

        return None


class NameParser(BaseParser):
    @classmethod
    def parse(cls, text):
        """
        Parse a name from the given text.
        - If an email is present in mailbox format, extract the portion before the email.
        - Use the `nameparser` package to parse the extracted name.

        :param text: The text to parse.
        :return: A tuple of (first_name, last_name).
        """
        # Check for email in the text.
        email = EmailParser.parse(text)
        if email:
            # Extract the portion of text before the email for name parsing.
            name_portion_index = text.find(email)
            name_portion = text[:name_portion_index].strip()
            name_portion = name_portion.rstrip('<').strip()
        else:
            # Use the entire text if no email is found.
            name_portion = text.strip()

        if not name_portion:
            return "", ""

        # Use the `HumanName` class to parse the name intelligently.
        name = HumanName(name_portion)

        # Return the first and last names as a tuple.
        return name.first, f"{name.middle} {name.last}".strip()
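Illustrative calls against the parsers above (editor's sketch, not shipped in the wheel; the sample inputs are invented, the phone number is the one used in the PhoneParser docstring):

from jodie.parsers.parsers import EmailParser, NameParser, PhoneParser, TitleParser

EmailParser.parse("Jane Doe <jane.doe@acme.example>")  # 'jane.doe@acme.example'
NameParser.parse("Jane Doe <jane.doe@acme.example>")   # ('Jane', 'Doe')
PhoneParser.parse("Office: (516) 776-3192")            # '5167763192'
TitleParser.parse("Senior Software Engineer")          # 'Senior Software Engineer'
TitleParser.parse("Looking forward to meeting you")    # None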
jodie/parsers/pipeline.py
ADDED
@@ -0,0 +1,127 @@
#!/usr/bin/env python3
# jodie/parsers/pipeline.py
"""Parser pipeline for orchestrating field extraction."""

from typing import List, Dict
from .base import ParseResult
from jodie.constants import WEBMAIL_DOMAINS


class ParserPipeline:
    """Orchestrates parsers to extract contact fields from text.

    The pipeline:
    1. Runs parsers in priority order
    2. Stops processing a line when a parser matches
    3. Applies cross-field inference after all parsing
    """

    @classmethod
    def parse(cls, lines: List[str]) -> Dict[str, ParseResult]:
        """Run all parsers on input lines.

        Args:
            lines: List of text lines to parse

        Returns:
            Dict mapping field names to ParseResults
        """
        # Import parsers here to avoid circular imports
        from .parsers import (
            EmailParser, PhoneParser, WebsiteParser,
            TitleParser, NameParser
        )

        results: Dict[str, ParseResult] = {}

        # Parser configuration: (parser_class, field_name, priority)
        # Higher priority = runs first
        parsers = [
            (EmailParser, "email", 100),
            (WebsiteParser, "websites", 90),
            (TitleParser, "job_title", 70),
            (PhoneParser, "phone", 60),
            (NameParser, "name", 50),
        ]

        # Sort by priority (highest first)
        parsers.sort(key=lambda x: x[2], reverse=True)

        for line in lines:
            if not line or not line.strip():
                continue

            for parser_class, field_name, _ in parsers:
                # Skip if we already have this field
                if field_name in results:
                    continue

                # Try to parse
                try:
                    result = parser_class.parse(line)
                    if result is not None:
                        # Handle NameParser's tuple return
                        if field_name == "name" and isinstance(result, tuple):
                            first, last = result
                            if first or last:
                                results["first_name"] = ParseResult(
                                    value=first,
                                    confidence=0.95,
                                    consumed_text=line,
                                    source="parsed"
                                )
                                results["last_name"] = ParseResult(
                                    value=last,
                                    confidence=0.95,
                                    consumed_text=line,
                                    source="parsed"
                                )
                        else:
                            results[field_name] = ParseResult(
                                value=result,
                                confidence=0.95,
                                consumed_text=line,
                                source="parsed"
                            )
                        break  # Line consumed
                except Exception:
                    # Parser failed, try next
                    continue

        # Apply cross-field inference
        cls._apply_inference(results)

        return results

    @classmethod
    def _apply_inference(cls, results: Dict[str, ParseResult]) -> None:
        """Apply cross-field inference after parsing.

        Args:
            results: Dict of parsed results (modified in place)
        """
        # Infer company from email domain
        if "company" not in results and "email" in results:
            email = results["email"].value
            if email and "@" in email:
                domain = email.split("@")[1].lower()
                if domain not in WEBMAIL_DOMAINS:
                    company = domain.split(".")[0].title()
                    results["company"] = ParseResult(
                        value=company,
                        confidence=0.6,
                        consumed_text=domain,
                        source="inferred"
                    )

    @classmethod
    def to_dict(cls, results: Dict[str, ParseResult]) -> Dict[str, any]:
        """Convert ParseResults to simple dict of values.

        Args:
            results: Dict mapping field names to ParseResults

        Returns:
            Dict mapping field names to values
        """
        return {k: v.value for k, v in results.items()}