jodie 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ import re
2
+ from typing import List
3
+
4
class SignaturePreprocessor:
    """Preprocess email signatures before parsing.

    Drops reply/forward noise lines, removes parenthesised pronoun
    annotations such as ``(he/him)``, and splits each remaining line on
    common inline separators.
    """

    # Noise patterns to filter out. Each is tried with ``re.match`` (anchored
    # at the start of the stripped line) and case-insensitively.
    NOISE_PATTERNS = [
        r'^>.*$',  # Quoted replies
        r'^On .* wrote:$',  # Reply attribution lines
        r'^-+\s*Original Message\s*-+$',
        r'^From:.*$',
        r'^Sent:.*$',
        r'^To:.*$',
        r'^Subject:.*$',
    ]

    # Pronouns to strip from names. The \b anchors require a whole pronoun
    # word inside the parentheses; without them "(The Shop)" would be removed
    # because "The" contains "he".
    PRONOUNS = r'\s*\([^)]*\b(?:he|she|they|him|her|them)\b[^)]*\)\s*'

    # Common separators
    SEPARATORS = r'[•·|–—]'

    @classmethod
    def preprocess(cls, text: str) -> List[str]:
        """Clean and split signature text into parseable tokens.

        Args:
            text: Raw signature text, possibly spanning several lines.

        Returns:
            The non-empty, whitespace-trimmed fragments in input order.
            An empty or falsy ``text`` yields an empty list.
        """
        if not text:
            return []

        tokens = []
        for raw_line in text.strip().split('\n'):
            line = raw_line.strip()

            # Skip empty lines
            if not line:
                continue

            # Skip noise patterns
            if any(re.match(pattern, line, re.IGNORECASE)
                   for pattern in cls.NOISE_PATTERNS):
                continue

            # Strip pronouns. The match swallows the whitespace on both sides,
            # so substitute a single space to keep neighbouring words apart
            # ("John (he/him) Smith" -> "John Smith").
            line = re.sub(cls.PRONOUNS, ' ', line, flags=re.IGNORECASE)

            # Split by common separators, dropping blank fragments
            for part in re.split(cls.SEPARATORS, line):
                part = part.strip()
                if part:
                    tokens.append(part)

        return tokens
jodie/input/stdin.py ADDED
@@ -0,0 +1,7 @@
1
+ import sys
2
+
3
def read_stdin() -> str:
    """Return everything available on standard input.

    Intended for heredoc or piped input.

    Raises:
        RuntimeError: If standard input is an interactive terminal, i.e.
            nothing was piped or redirected into the process.
    """
    stream = sys.stdin
    if not stream.isatty():
        return stream.read()
    raise RuntimeError("No input provided via stdin")
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env python3
2
+ # jodie/parsers/__init__.py
3
+ from jodie.parsers.parsers import (
4
+ BaseParser,
5
+ EmailParser,
6
+ NameParser,
7
+ WebsiteParser,
8
+ TitleParser,
9
+ PhoneParser
10
+ )
11
+ from jodie.parsers.base import ParseResult
12
+ from jodie.parsers.pipeline import ParserPipeline
13
+
14
+ __all__ = (
15
+ "BaseParser",
16
+ "EmailParser",
17
+ "NameParser",
18
+ "WebsiteParser",
19
+ "TitleParser",
20
+ "PhoneParser",
21
+ "ParseResult",
22
+ "ParserPipeline"
23
+ )
jodie/parsers/base.py ADDED
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env python3
2
+ # jodie/parsers/base.py
3
+ """Base classes for parser plugin architecture."""
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass
7
+ from typing import Optional, Any
8
+
9
+
10
@dataclass
class ParseResult:
    """A single value extracted by a parser, with confidence and provenance."""

    value: Any
    confidence: float  # 0.0 to 1.0
    consumed_text: str
    source: str = "parsed"  # "parsed", "inferred", "explicit"

    def __repr__(self) -> str:
        """Return a developer-oriented form: value, confidence percent, source."""
        return (
            f"ParseResult({self.value!r}, {self.confidence:.0%}, "
            f"source={self.source!r})"
        )

    def __str__(self) -> str:
        """Return a user-oriented form: the value followed by its confidence."""
        return f"{self.value} ({self.confidence:.0%})"
27
+
28
+
29
class BaseParser(ABC):
    """Abstract base class for all parsers.

    Writing a new parser:
    1. Add a module under jodie/parsers/
    2. Derive a class from BaseParser
    3. Assign the ``name``, ``priority`` and ``field_name`` class attributes
    4. Provide the ``parse()`` classmethod

    NOTE(review): this class was originally described as auto-discovered by
    the pipeline; no discovery mechanism is defined here -- confirm how
    parsers are registered before relying on it.
    """

    name: str = ""
    priority: int = 0  # Higher = runs first
    field_name: str = ""

    @classmethod
    @abstractmethod
    def parse(cls, text: str) -> Optional[ParseResult]:
        """Parse ``text`` into a result.

        Args:
            text: Input text to parse

        Returns:
            A ParseResult when something matched, otherwise None
        """

    @staticmethod
    def find_matches(pattern: str, text: str) -> list:
        """Return every match of ``pattern`` in ``text``.

        Args:
            pattern: Regex pattern
            text: Text to search

        Returns:
            List of matches, as produced by ``findall``
        """
        # Imported locally, as in the original implementation.
        import re

        compiled = re.compile(pattern)
        return compiled.findall(text)
@@ -0,0 +1,243 @@
1
+ #!/usr/bin/env python3
2
+ # jodie/parsers/parsers.py
3
+ import re
4
+ from nameparser import HumanName
5
+
6
class BaseParser:
    """
    Common base for parsers that turn text into one contact property.
    """

    def __init__(self):
        """Create a parser instance; no state is initialised."""

    @staticmethod
    def find_matches(pattern, text):
        """
        Collect every match of a regex pattern in the given text.

        :param pattern: Regex pattern to apply.
        :param text: The text to search in.
        :return: A list of all matches, as produced by ``findall``.
        """
        compiled = re.compile(pattern)
        return compiled.findall(text)

    @classmethod
    def parse(cls, text):
        """
        Extract information from the given text.
        Concrete parsers override this; the base version always raises.

        :param text: The text to parse.
        :return: Extracted information.
        :raises NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError("Subclasses must implement this method.")
35
+
36
+
37
class EmailParser(BaseParser):
    """Parser that pulls an email address out of free text."""

    @classmethod
    def parse(cls, text):
        """
        Extract one email address from the text.
        The address may be bare or wrapped in angle brackets; the first
        match is returned, or None when there is none.
        """
        email_pattern = r'<(\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)>' \
                        r'|(\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)'
        found = cls.find_matches(email_pattern, text)
        if not found:
            return None

        first = found[0]
        if isinstance(first, tuple):
            # The pattern has two alternative groups; only the one that
            # matched is non-empty.
            return next((group for group in first if group), None)
        return first
53
+
54
+
55
class TitleParser(BaseParser):
    """Recognise job titles by matching against a vocabulary of known titles."""

    # Common job titles and their variations (all lowercase)
    COMMON_TITLES = {
        # C-level
        "ceo", "cto", "cfo", "coo", "chief executive officer", "chief technology officer",
        "chief financial officer", "chief operating officer",

        # Founder/Leadership
        "founder", "co-founder", "cofounder", "president", "vice president", "vp",
        "director", "head", "lead", "principal",

        # Engineering
        "engineer", "developer", "architect", "scientist", "researcher",
        "software engineer", "senior engineer", "staff engineer", "full stack engineer",
        "senior full stack engineer", "senior software engineer",

        # Management
        "manager", "supervisor", "coordinator", "specialist", "analyst",
        "project manager", "product manager", "program manager",

        # Business
        "consultant", "advisor", "partner", "associate", "representative",
        "business development", "sales", "marketing",

        # Academic ("researcher" is already listed under Engineering)
        "professor", "lecturer", "fellow", "scholar",

        # Creative
        "designer", "artist", "writer", "editor", "producer",

        # Other ("coordinator"/"specialist" are already listed under Management)
        "intern", "assistant",
    }

    # Acronyms that should stay uppercase
    ACRONYMS = {"CEO", "CTO", "CFO", "COO", "VP"}

    # Common title prefixes. Kept as a public attribute for compatibility;
    # parse() does not need it because a "prefix + known title" phrase always
    # contains the known title as a sub-phrase, which the scan below finds.
    PREFIXES = {
        "senior", "lead", "principal", "staff", "associate", "junior",
        "full stack", "front end", "back end", "full-stack", "front-end", "back-end"
    }

    @classmethod
    def parse(cls, text):
        """
        Decide whether ``text`` is a job title.

        - A text that is exactly a known acronym (ignoring case and
          surrounding whitespace) is returned uppercased and trimmed,
          e.g. " ceo " -> "CEO".
        - Any other text that equals a known title, or contains a known
          title as a run of consecutive words, is returned unchanged.
        - Non-string, empty or unrecognised text yields None.

        :param text: The text to inspect.
        :return: The title string, or None for non-title text.
        """
        if not isinstance(text, str):
            return None
        stripped = text.strip()
        if not stripped:
            return None

        # Normalize text for comparison
        cleaned_text = stripped.lower()

        # Direct match against common titles
        if cleaned_text in cls.COMMON_TITLES:
            # Compare the *stripped* text so padded acronyms are still
            # normalised; the lookup above already ignores the padding.
            acronym = stripped.upper()
            if acronym in cls.ACRONYMS:
                return acronym
            return text

        # Accept the text if any run of consecutive words is a known title.
        words = cleaned_text.split()
        for start in range(len(words)):
            for end in range(start + 1, len(words) + 1):
                if " ".join(words[start:end]) in cls.COMMON_TITLES:
                    return text

        return None
144
+
145
+
146
class WebsiteParser(BaseParser):
    """Parser that pulls the first URL out of free text."""

    @classmethod
    def parse(cls, text):
        """
        Return the first URL-like token in the text, or None.

        A token qualifies when it starts with ``http://``, ``https://`` or
        ``www.``; it extends to the next whitespace character, so any
        trailing punctuation is part of the returned value.
        """
        url_pattern = r'https?://[^\s]+|www\.[^\s]+'
        found = cls.find_matches(url_pattern, text)
        if not found:
            return None
        return found[0]
153
+
154
+
155
class PhoneParser(BaseParser):
    """Parser that pulls a phone number out of free text."""

    @classmethod
    def parse(cls, text):
        """
        Extract a phone number from the given text.
        Recognised shapes include:
        - 10 digits: 5167763192
        - With country code: +15167763192, 15167763192
        - With separators: 516-776-3192, 516.776.3192, (516) 776-3192
        - International formats: +1 516 776 3192

        Returns the digits of the phone number (a leading US "1" country
        code is dropped from 11-digit results), or None if no valid phone
        is found.
        """
        if not isinstance(text, str) or not text.strip():
            return None

        # Patterns are tried in this order; the first usable match wins.
        phone_patterns = [
            # US phone numbers with various formatting
            r'\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b',
            # 10 digit number
            r'\b([0-9]{10})\b',
            # International format with country code
            r'\+?([0-9]{1,3})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})\b'
        ]

        for pattern in phone_patterns:
            found = cls.find_matches(pattern, text)
            if not found:
                continue

            # Only the first match of each pattern is considered. Multi-group
            # patterns yield tuples whose non-empty groups are concatenated.
            first = found[0]
            if isinstance(first, tuple):
                candidate = ''.join(group for group in first if group)
            else:
                candidate = first

            size = len(candidate)
            if size == 11 and candidate.startswith('1'):
                # US phone number with country code: return without it
                return candidate[1:]
            if 7 <= size <= 15:
                # 10-digit US numbers and other international lengths
                return candidate

        # Fallback: if the text holds exactly 10 or 11 digits overall,
        # treat them as the phone number.
        every_digit = ''.join(ch for ch in text if ch.isdigit())
        if len(every_digit) == 10:
            return every_digit
        if len(every_digit) == 11 and every_digit.startswith('1'):
            return every_digit[1:]

        return None
212
+
213
+
214
class NameParser(BaseParser):
    """Parser that splits a personal name into first and last parts."""

    @classmethod
    def parse(cls, text):
        """
        Extract a name from the given text.
        - When the text contains an email address, only the part before it
          is treated as the name (a trailing ``<`` is removed).
        - The name is then split with the `nameparser` package.

        :param text: The text to parse.
        :return: A tuple of (first_name, last_name); the last name includes
                 any middle name. Both are empty strings when no name text
                 remains.
        """
        email = EmailParser.parse(text)
        if email:
            # Keep what precedes the address, minus the opening angle bracket
            # of a mailbox-style "Name <address>" form.
            leading = text[:text.find(email)]
            candidate = leading.strip().rstrip('<').strip()
        else:
            # No address present: the whole text is the candidate name.
            candidate = text.strip()

        if not candidate:
            return "", ""

        parsed = HumanName(candidate)
        first_name = parsed.first
        last_name = f"{parsed.middle} {parsed.last}".strip()
        return first_name, last_name
@@ -0,0 +1,127 @@
1
+ #!/usr/bin/env python3
2
+ # jodie/parsers/pipeline.py
3
+ """Parser pipeline for orchestrating field extraction."""
4
+
5
from typing import Any, Dict, List

from .base import ParseResult
from jodie.constants import WEBMAIL_DOMAINS
8
+
9
+
10
class ParserPipeline:
    """Orchestrates parsers to extract contact fields from text.

    The pipeline:
    1. Runs parsers in priority order
    2. Stops processing a line when a parser matches
    3. Applies cross-field inference after all parsing
    """

    # Keys under which the name parser's (first, last) tuple is stored.
    _NAME_KEYS = ("first_name", "last_name")

    @classmethod
    def parse(cls, lines: List[str]) -> Dict[str, ParseResult]:
        """Run all parsers on input lines.

        Each field is filled by the first line that yields it; later lines
        never overwrite a field that is already present.

        Args:
            lines: List of text lines to parse

        Returns:
            Dict mapping field names to ParseResults
        """
        import logging

        # Import parsers here to avoid circular imports
        from .parsers import (
            EmailParser, PhoneParser, WebsiteParser,
            TitleParser, NameParser
        )

        logger = logging.getLogger(__name__)
        results: Dict[str, ParseResult] = {}

        # Parser configuration: (parser_class, field_name, priority)
        # Higher priority = runs first
        parsers = [
            (EmailParser, "email", 100),
            (WebsiteParser, "websites", 90),
            (TitleParser, "job_title", 70),
            (PhoneParser, "phone", 60),
            (NameParser, "name", 50),
        ]

        # Sort by priority (highest first)
        parsers.sort(key=lambda entry: entry[2], reverse=True)

        for line in lines:
            if not line or not line.strip():
                continue

            for parser_class, field_name, _ in parsers:
                # Skip if we already have this field. Names are stored under
                # first_name/last_name, never under "name", so they need
                # their own check; otherwise every later free-text line
                # would overwrite the name found earlier.
                if field_name == "name":
                    already_found = any(key in results for key in cls._NAME_KEYS)
                else:
                    already_found = field_name in results
                if already_found:
                    continue

                try:
                    result = parser_class.parse(line)
                except Exception:
                    # Best effort: a failing parser must not abort the run,
                    # but leave a trace for debugging and try the next one.
                    logger.debug(
                        "%s raised while parsing a line",
                        parser_class.__name__,
                        exc_info=True,
                    )
                    continue

                if result is None:
                    continue

                if field_name == "name" and isinstance(result, tuple):
                    # Handle NameParser's tuple return
                    first, last = result
                    if first or last:
                        results["first_name"] = ParseResult(
                            value=first,
                            confidence=0.95,
                            consumed_text=line,
                            source="parsed"
                        )
                        results["last_name"] = ParseResult(
                            value=last,
                            confidence=0.95,
                            consumed_text=line,
                            source="parsed"
                        )
                else:
                    results[field_name] = ParseResult(
                        value=result,
                        confidence=0.95,
                        consumed_text=line,
                        source="parsed"
                    )
                break  # Line consumed

        # Apply cross-field inference
        cls._apply_inference(results)

        return results

    @classmethod
    def _apply_inference(cls, results: Dict[str, ParseResult]) -> None:
        """Apply cross-field inference after parsing.

        Currently infers ``company`` from the email domain when no company
        is present and the domain is not listed in WEBMAIL_DOMAINS.

        Args:
            results: Dict of parsed results (modified in place)
        """
        if "company" in results or "email" not in results:
            return

        email = results["email"].value
        if not email or "@" not in email:
            return

        domain = email.split("@")[1].lower()
        if domain in WEBMAIL_DOMAINS:
            return

        # Use the first label of the domain, title-cased, as the company.
        results["company"] = ParseResult(
            value=domain.split(".")[0].title(),
            confidence=0.6,
            consumed_text=domain,
            source="inferred"
        )

    @classmethod
    def to_dict(cls, results: Dict[str, ParseResult]) -> Dict[str, Any]:
        """Convert ParseResults to simple dict of values.

        Args:
            results: Dict mapping field names to ParseResults

        Returns:
            Dict mapping field names to values
        """
        return {field: parsed.value for field, parsed in results.items()}