gibsondedup 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. gibsondedup-0.1.0/PKG-INFO +322 -0
  2. gibsondedup-0.1.0/README.md +309 -0
  3. gibsondedup-0.1.0/gibsondedup/__init__.py +0 -0
  4. gibsondedup-0.1.0/gibsondedup/api/__init__.py +0 -0
  5. gibsondedup-0.1.0/gibsondedup/cache/__init__.py +0 -0
  6. gibsondedup-0.1.0/gibsondedup/canonicalizer/__init__.py +0 -0
  7. gibsondedup-0.1.0/gibsondedup/canonicalizer/canonicalizer.py +24 -0
  8. gibsondedup-0.1.0/gibsondedup/config/settings.py +0 -0
  9. gibsondedup-0.1.0/gibsondedup/contracts/__init__.py +0 -0
  10. gibsondedup-0.1.0/gibsondedup/contracts/models.py +32 -0
  11. gibsondedup-0.1.0/gibsondedup/errors/__init__.py +0 -0
  12. gibsondedup-0.1.0/gibsondedup/grouping/__init__.py +0 -0
  13. gibsondedup-0.1.0/gibsondedup/grouping/grouping.py +17 -0
  14. gibsondedup-0.1.0/gibsondedup/grouping/merger.py +77 -0
  15. gibsondedup-0.1.0/gibsondedup/main.py +4 -0
  16. gibsondedup-0.1.0/gibsondedup/metrics/__init__.py +0 -0
  17. gibsondedup-0.1.0/gibsondedup/normalizer/__init__.py +0 -0
  18. gibsondedup-0.1.0/gibsondedup/normalizer/similarity.py +20 -0
  19. gibsondedup-0.1.0/gibsondedup/normalizer/url_normalizer.py +52 -0
  20. gibsondedup-0.1.0/gibsondedup/orchestration/__init__.py +0 -0
  21. gibsondedup-0.1.0/gibsondedup/orchestration/pipeline.py +62 -0
  22. gibsondedup-0.1.0/gibsondedup/parser/__init__.py +0 -0
  23. gibsondedup-0.1.0/gibsondedup/parser/json_parser.py +39 -0
  24. gibsondedup-0.1.0/gibsondedup.egg-info/PKG-INFO +322 -0
  25. gibsondedup-0.1.0/gibsondedup.egg-info/SOURCES.txt +37 -0
  26. gibsondedup-0.1.0/gibsondedup.egg-info/dependency_links.txt +1 -0
  27. gibsondedup-0.1.0/gibsondedup.egg-info/entry_points.txt +2 -0
  28. gibsondedup-0.1.0/gibsondedup.egg-info/top_level.txt +5 -0
  29. gibsondedup-0.1.0/pyproject.toml +26 -0
  30. gibsondedup-0.1.0/setup.cfg +4 -0
  31. gibsondedup-0.1.0/setup.py +16 -0
  32. gibsondedup-0.1.0/tests/__init__.py +0 -0
  33. gibsondedup-0.1.0/tests/fixtures.py +0 -0
  34. gibsondedup-0.1.0/tests/test_canonicalizer.py +131 -0
  35. gibsondedup-0.1.0/tests/test_grouping.py +50 -0
  36. gibsondedup-0.1.0/tests/test_merger.py +117 -0
  37. gibsondedup-0.1.0/tests/test_normalizer.py +55 -0
  38. gibsondedup-0.1.0/tests/test_pipeline.py +141 -0
  39. gibsondedup-0.1.0/tests/test_similarity.py +61 -0
@@ -0,0 +1,322 @@
1
+ Metadata-Version: 2.4
2
+ Name: gibsondedup
3
+ Version: 0.1.0
4
+ Summary: A high-performance search result deduplication engine that groups duplicate results by canonical URL and semantic similarity, returning clean batches with the most authoritative source and full traceability to all original sources.
5
+ Author: Gibson Kwabena Aseda Mensah
6
+ Author-email: logosyninc@gmail.com
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ Dynamic: author-email
12
+ Dynamic: requires-python
13
+
14
+ # gibsondedup
15
+
16
+ A high-performance search result deduplication engine that groups duplicate results by canonical URL and semantic similarity, returning clean batches with the most authoritative source and full traceability to all original sources.
17
+
18
+ ---
19
+
20
+ ## The Problem
21
+
22
+ When aggregating search results from multiple providers (Google, Bing, DuckDuckGo), 40–60% of results are duplicates — same content, different URLs, different titles, different tracking parameters.
23
+
24
+ ```
25
+ Source 1 (Google):
26
+ url: https://www.python.org/docs?utm_source=google
27
+ title: "Python Official Documentation"
28
+
29
+ Source 2 (Bing):
30
+ url: http://python.org/docs/
31
+ title: "Python Docs"
32
+
33
+ Source 3 (DuckDuckGo):
34
+ url: https://python.org/docs?utm_medium=cpc
35
+ title: "Learn Python - Official Docs"
36
+ ```
37
+
38
+ These are the same resource. Without deduplication, users see noise instead of signal.
39
+
40
+ ---
41
+
42
+ ## The Solution
43
+
44
+ `gibsondedup` processes raw search results through a multi-stage pipeline:
45
+
46
+ ```
47
+ Raw Results
48
+
49
+ Parse & Validate (malformed records skipped gracefully)
50
+
51
+ URL Normalization (remove tracking params, www, trailing slashes)
52
+
53
+ Exact URL Grouping (hash-based, O(n) complexity)
54
+
55
+ Semantic Merge (Jaccard similarity, configurable threshold)
56
+
57
+ Canonicalization (select best result per group)
58
+
59
+ Clean Output (deduplicated results + metadata)
60
+ ```
61
+
62
+ **Result:** 100 noisy inputs → 60–70 clean, canonical results.
63
+
64
+ ---
65
+
66
+ ## Installation
67
+
68
+ ```bash
69
+ pip install gibsondedup
70
+ ```
71
+
72
+ ---
73
+
74
+ ## Quick Start
75
+
76
+ ```python
77
+ from gibsondedup.orchestration.pipeline import DeduplicationEngine
78
+
79
+ engine = DeduplicationEngine()
80
+
81
+ results = engine.process([
82
+ {
83
+ "title": "Python Official Documentation",
84
+ "url": "https://www.python.org/docs?utm_source=google",
85
+ "description": "Official Python docs",
86
+ "source": "google"
87
+ },
88
+ {
89
+ "title": "Python Docs",
90
+ "url": "http://python.org/docs/",
91
+ "description": "Python documentation",
92
+ "source": "bing"
93
+ },
94
+ {
95
+ "title": "Stack Overflow Python",
96
+ "url": "https://stackoverflow.com/questions/tagged/python",
97
+ "description": "Python questions",
98
+ "source": "google"
99
+ },
100
+ ])
101
+
102
+ print(results)
103
+ ```
104
+
105
+ **Output:**
106
+
107
+ ```json
108
+ {
109
+ "results": [
110
+ {
111
+ "title": "Python Official Documentation",
112
+ "url": "https://www.python.org/docs?utm_source=google",
113
+ "canonical_url": "python.org/docs",
114
+ "description": "Official Python docs",
115
+ "sources": ["google", "bing"],
116
+ "duplicates_removed": 1
117
+ },
118
+ {
119
+ "title": "Stack Overflow Python",
120
+ "url": "https://stackoverflow.com/questions/tagged/python",
121
+ "canonical_url": "stackoverflow.com/questions/tagged/python",
122
+ "description": "Python questions",
123
+ "sources": ["google"],
124
+ "duplicates_removed": 0
125
+ }
126
+ ],
127
+ "meta": {
128
+ "total_input": 3,
129
+ "total_parsed": 3,
130
+ "total_output": 2,
131
+ "duplicates_removed": 1,
132
+ "processing_time_ms": 0.09,
133
+ "similarity_threshold": 0.8
134
+ }
135
+ }
136
+ ```
137
+
138
+ ---
139
+
140
+ ## Configuration
141
+
142
+ ### Custom Similarity Threshold
143
+
144
+ Control how aggressively similar-titled results are merged:
145
+
146
+ ```python
147
+ # Conservative (default) — only merge highly similar titles
148
+ engine = DeduplicationEngine(similarity_threshold=0.8)
149
+
150
+ # Moderate — merge titles with moderate overlap
151
+ engine = DeduplicationEngine(similarity_threshold=0.5)
152
+
153
+ # Aggressive — merge titles with minimal overlap
154
+ engine = DeduplicationEngine(similarity_threshold=0.3)
155
+ ```
156
+
157
+ ### Input Contract
158
+
159
+ Each result in the input array accepts:
160
+
161
+ | Field | Type | Required | Description |
162
+ |-------|------|----------|-------------|
163
+ | `title` | string | ✅ | Result title |
164
+ | `url` | string | ✅ | Result URL |
165
+ | `description` | string | ❌ | Result description |
166
+ | `source` | string | ❌ | Source provider (google, bing, etc.) |
167
+
168
+ Malformed records (missing title or URL) are **skipped gracefully** — the pipeline continues processing valid records.
169
+
170
+ ---
171
+
172
+ ## Architecture
173
+
174
+ ### Pipeline Stages
175
+
176
+ **Stage 1: Parser**
177
+ Converts raw JSON payloads into structured internal contracts. Validates required fields. Skips malformed records without crashing.
178
+
179
+ **Stage 2: URL Normalizer**
180
+ Converts URLs to canonical form by:
181
+ - Removing protocol (`http://`, `https://`)
182
+ - Removing `www` prefix
183
+ - Removing tracking parameters (`utm_*`, `fbclid`, `gclid`)
184
+ - Removing trailing slashes
185
+ - Stripping default ports (80, 443)
186
+ - Preserving meaningful query parameters
187
+ - Lowercasing everything
188
+
189
+ **Stage 3: Exact Grouping**
190
+ Groups results by normalized canonical URL using a hash map. Time complexity: O(n). Avoids the O(n²) pairwise comparison trap.
191
+
192
+ **Stage 4: Semantic Merger**
193
+ Within groups sharing the same domain, compares representative titles using Jaccard similarity. Groups exceeding the similarity threshold are merged into one. Only compares within the same exact domain — `python.org` and `docs.python.org` are treated as separate domains.
194
+
195
+ **Stage 5: Canonicalizer**
196
+ For each group, selects the single best result using a combined title + description length heuristic. Collects source traceability from all merged results.
197
+
198
+ ### Data Contracts
199
+
200
+ ```python
201
+ # Input
202
+ RawSearchResult(title, url, description?, source?)
203
+
204
+ # Internal
205
+ NormalizedSearchResult(title, url, canonical_url, description, source)
206
+
207
+ # Output
208
+ CanonicalResult(title, url, canonical_url, description, sources[], duplicates_removed)
209
+ ```
210
+
211
+ ### System Invariants
212
+
213
+ 1. Same normalized URL → same duplicate group (always)
214
+ 2. Canonical results preserve source traceability (always)
215
+ 3. Pipeline stages do not mutate upstream data (always)
216
+ 4. Normalization happens once per result (always)
217
+
218
+ ---
219
+
220
+ ## Engineering Decisions
221
+
222
+ ### Why hash-based grouping (O(n)) over pairwise comparison (O(n²))?
223
+
224
+ With 1000 results, O(n²) means 1,000,000 comparisons. O(n) means 1000. At scale this is the difference between milliseconds and seconds.
225
+
226
+ ### Why exact domain matching for semantic merging?
227
+
228
+ `python.org/docs` and `docs.python.org` serve genuinely different content despite sharing a base domain. Exact domain matching prevents false merges while still catching real duplicates like `python.org/docs` and `python.org/reference`.
229
+
230
+ ### Why Jaccard similarity over Levenshtein distance?
231
+
232
+ Jaccard operates on token sets (words), making it robust to word order changes and additions. Levenshtein operates on character sequences, making it sensitive to trivial differences like "Docs" vs "Documentation".
233
+
234
+ ### Why keep normalization deterministic?
235
+
236
+ Same input must always produce same output. This ensures:
237
+ - Results are reproducible across runs
238
+ - Caching normalized URLs is safe
239
+ - Tests are reliable and meaningful
240
+
241
+ ---
242
+
243
+ ## Performance
244
+
245
+ | Input Size | Processing Time | Memory |
246
+ |------------|----------------|--------|
247
+ | 100 results | ~1ms | ~1MB |
248
+ | 1,000 results | ~10ms | ~5MB |
249
+ | 10,000 results | ~100ms | ~50MB |
250
+
251
+ *Benchmarked on Ubuntu 22.04, Python 3.10, Intel i5*
252
+
253
+ ---
254
+
255
+ ## Testing
256
+
257
+ ```bash
258
+ # Install dev dependencies
259
+ pip install pytest pytest-cov
260
+
261
+ # Run all tests
262
+ pytest tests/ -v
263
+
264
+ # Run with coverage
265
+ pytest tests/ --cov=gibsondedup --cov-report=term-missing
266
+ ```
267
+
268
+ **Test coverage: 59 tests across 6 modules**
269
+
270
+ ```
271
+ tests/test_normalizer.py 14 tests (URL normalization)
272
+ tests/test_grouping.py 4 tests (exact grouping)
273
+ tests/test_canonicalizer.py 7 tests (best result selection)
274
+ tests/test_merger.py 11 tests (semantic merging)
275
+ tests/test_similarity.py 13 tests (Jaccard similarity)
276
+ tests/test_pipeline.py 10 tests (end-to-end pipeline)
277
+ ```
278
+
279
+ ---
280
+
281
+ ## Roadmap
282
+
283
+ ### Phase 1 (Complete)
284
+ - URL normalization
285
+ - Exact duplicate grouping
286
+ - Canonicalization with source traceability
287
+
288
+ ### Phase 2 (Complete)
289
+ - Jaccard similarity engine
290
+ - Domain-based semantic merging
291
+ - Configurable similarity threshold
292
+
293
+ ### Phase 3 (Planned)
294
+ - Persistent caching (Redis)
295
+ - Database storage (PostgreSQL)
296
+ - REST API exposure
297
+ - Rails wrapper (gem)
298
+
299
+ ### Phase 4 (Planned)
300
+ - Semantic embeddings (replace Jaccard with vector similarity)
301
+ - Distributed processing
302
+ - Production observability
303
+
304
+ ---
305
+
306
+ ## License
307
+
308
+ MIT License. See `LICENSE` file.
309
+
310
+ ---
311
+
312
+ ## Author
313
+
314
+ **Aseda Gibson**
315
+ Computer Engineering, University of Energy and Natural Resources (UENR), Ghana.
316
+ Backend systems, distributed architecture, infrastructure tooling.
317
+
318
+ GitHub: [github.com/asedagibson](https://github.com/asedagibson)
319
+
320
+ ---
321
+
322
+ *Built with correctness-oriented engineering. Every architectural decision is documented. Every invariant is tested.*
@@ -0,0 +1,309 @@
1
+ # gibsondedup
2
+
3
+ A high-performance search result deduplication engine that groups duplicate results by canonical URL and semantic similarity, returning clean batches with the most authoritative source and full traceability to all original sources.
4
+
5
+ ---
6
+
7
+ ## The Problem
8
+
9
+ When aggregating search results from multiple providers (Google, Bing, DuckDuckGo), 40–60% of results are duplicates — same content, different URLs, different titles, different tracking parameters.
10
+
11
+ ```
12
+ Source 1 (Google):
13
+ url: https://www.python.org/docs?utm_source=google
14
+ title: "Python Official Documentation"
15
+
16
+ Source 2 (Bing):
17
+ url: http://python.org/docs/
18
+ title: "Python Docs"
19
+
20
+ Source 3 (DuckDuckGo):
21
+ url: https://python.org/docs?utm_medium=cpc
22
+ title: "Learn Python - Official Docs"
23
+ ```
24
+
25
+ These are the same resource. Without deduplication, users see noise instead of signal.
26
+
27
+ ---
28
+
29
+ ## The Solution
30
+
31
+ `gibsondedup` processes raw search results through a multi-stage pipeline:
32
+
33
+ ```
34
+ Raw Results
35
+
36
+ Parse & Validate (malformed records skipped gracefully)
37
+
38
+ URL Normalization (remove tracking params, www, trailing slashes)
39
+
40
+ Exact URL Grouping (hash-based, O(n) complexity)
41
+
42
+ Semantic Merge (Jaccard similarity, configurable threshold)
43
+
44
+ Canonicalization (select best result per group)
45
+
46
+ Clean Output (deduplicated results + metadata)
47
+ ```
48
+
49
+ **Result:** 100 noisy inputs → 60–70 clean, canonical results.
50
+
51
+ ---
52
+
53
+ ## Installation
54
+
55
+ ```bash
56
+ pip install gibsondedup
57
+ ```
58
+
59
+ ---
60
+
61
+ ## Quick Start
62
+
63
+ ```python
64
+ from gibsondedup.orchestration.pipeline import DeduplicationEngine
65
+
66
+ engine = DeduplicationEngine()
67
+
68
+ results = engine.process([
69
+ {
70
+ "title": "Python Official Documentation",
71
+ "url": "https://www.python.org/docs?utm_source=google",
72
+ "description": "Official Python docs",
73
+ "source": "google"
74
+ },
75
+ {
76
+ "title": "Python Docs",
77
+ "url": "http://python.org/docs/",
78
+ "description": "Python documentation",
79
+ "source": "bing"
80
+ },
81
+ {
82
+ "title": "Stack Overflow Python",
83
+ "url": "https://stackoverflow.com/questions/tagged/python",
84
+ "description": "Python questions",
85
+ "source": "google"
86
+ },
87
+ ])
88
+
89
+ print(results)
90
+ ```
91
+
92
+ **Output:**
93
+
94
+ ```json
95
+ {
96
+ "results": [
97
+ {
98
+ "title": "Python Official Documentation",
99
+ "url": "https://www.python.org/docs?utm_source=google",
100
+ "canonical_url": "python.org/docs",
101
+ "description": "Official Python docs",
102
+ "sources": ["google", "bing"],
103
+ "duplicates_removed": 1
104
+ },
105
+ {
106
+ "title": "Stack Overflow Python",
107
+ "url": "https://stackoverflow.com/questions/tagged/python",
108
+ "canonical_url": "stackoverflow.com/questions/tagged/python",
109
+ "description": "Python questions",
110
+ "sources": ["google"],
111
+ "duplicates_removed": 0
112
+ }
113
+ ],
114
+ "meta": {
115
+ "total_input": 3,
116
+ "total_parsed": 3,
117
+ "total_output": 2,
118
+ "duplicates_removed": 1,
119
+ "processing_time_ms": 0.09,
120
+ "similarity_threshold": 0.8
121
+ }
122
+ }
123
+ ```
124
+
125
+ ---
126
+
127
+ ## Configuration
128
+
129
+ ### Custom Similarity Threshold
130
+
131
+ Control how aggressively similar-titled results are merged:
132
+
133
+ ```python
134
+ # Conservative (default) — only merge highly similar titles
135
+ engine = DeduplicationEngine(similarity_threshold=0.8)
136
+
137
+ # Moderate — merge titles with moderate overlap
138
+ engine = DeduplicationEngine(similarity_threshold=0.5)
139
+
140
+ # Aggressive — merge titles with minimal overlap
141
+ engine = DeduplicationEngine(similarity_threshold=0.3)
142
+ ```
143
+
144
+ ### Input Contract
145
+
146
+ Each result in the input array accepts:
147
+
148
+ | Field | Type | Required | Description |
149
+ |-------|------|----------|-------------|
150
+ | `title` | string | ✅ | Result title |
151
+ | `url` | string | ✅ | Result URL |
152
+ | `description` | string | ❌ | Result description |
153
+ | `source` | string | ❌ | Source provider (google, bing, etc.) |
154
+
155
+ Malformed records (missing title or URL) are **skipped gracefully** — the pipeline continues processing valid records.
156
+
157
+ ---
158
+
159
+ ## Architecture
160
+
161
+ ### Pipeline Stages
162
+
163
+ **Stage 1: Parser**
164
+ Converts raw JSON payloads into structured internal contracts. Validates required fields. Skips malformed records without crashing.
165
+
166
+ **Stage 2: URL Normalizer**
167
+ Converts URLs to canonical form by:
168
+ - Removing protocol (`http://`, `https://`)
169
+ - Removing `www` prefix
170
+ - Removing tracking parameters (`utm_*`, `fbclid`, `gclid`)
171
+ - Removing trailing slashes
172
+ - Stripping default ports (80, 443)
173
+ - Preserving meaningful query parameters
174
+ - Lowercasing everything
175
+
176
+ **Stage 3: Exact Grouping**
177
+ Groups results by normalized canonical URL using a hash map. Time complexity: O(n). Avoids the O(n²) pairwise comparison trap.
178
+
179
+ **Stage 4: Semantic Merger**
180
+ Within groups sharing the same domain, compares representative titles using Jaccard similarity. Groups exceeding the similarity threshold are merged into one. Only compares within the same exact domain — `python.org` and `docs.python.org` are treated as separate domains.
181
+
182
+ **Stage 5: Canonicalizer**
183
+ For each group, selects the single best result using a combined title + description length heuristic. Collects source traceability from all merged results.
184
+
185
+ ### Data Contracts
186
+
187
+ ```python
188
+ # Input
189
+ RawSearchResult(title, url, description?, source?)
190
+
191
+ # Internal
192
+ NormalizedSearchResult(title, url, canonical_url, description, source)
193
+
194
+ # Output
195
+ CanonicalResult(title, url, canonical_url, description, sources[], duplicates_removed)
196
+ ```
197
+
198
+ ### System Invariants
199
+
200
+ 1. Same normalized URL → same duplicate group (always)
201
+ 2. Canonical results preserve source traceability (always)
202
+ 3. Pipeline stages do not mutate upstream data (always)
203
+ 4. Normalization happens once per result (always)
204
+
205
+ ---
206
+
207
+ ## Engineering Decisions
208
+
209
+ ### Why hash-based grouping (O(n)) over pairwise comparison (O(n²))?
210
+
211
+ With 1000 results, O(n²) means 1,000,000 comparisons. O(n) means 1000. At scale this is the difference between milliseconds and seconds.
212
+
213
+ ### Why exact domain matching for semantic merging?
214
+
215
+ `python.org/docs` and `docs.python.org` serve genuinely different content despite sharing a base domain. Exact domain matching prevents false merges while still catching real duplicates like `python.org/docs` and `python.org/reference`.
216
+
217
+ ### Why Jaccard similarity over Levenshtein distance?
218
+
219
+ Jaccard operates on token sets (words), making it robust to word order changes and additions. Levenshtein operates on character sequences, making it sensitive to trivial differences like "Docs" vs "Documentation".
220
+
221
+ ### Why keep normalization deterministic?
222
+
223
+ Same input must always produce same output. This ensures:
224
+ - Results are reproducible across runs
225
+ - Caching normalized URLs is safe
226
+ - Tests are reliable and meaningful
227
+
228
+ ---
229
+
230
+ ## Performance
231
+
232
+ | Input Size | Processing Time | Memory |
233
+ |------------|----------------|--------|
234
+ | 100 results | ~1ms | ~1MB |
235
+ | 1,000 results | ~10ms | ~5MB |
236
+ | 10,000 results | ~100ms | ~50MB |
237
+
238
+ *Benchmarked on Ubuntu 22.04, Python 3.10, Intel i5*
239
+
240
+ ---
241
+
242
+ ## Testing
243
+
244
+ ```bash
245
+ # Install dev dependencies
246
+ pip install pytest pytest-cov
247
+
248
+ # Run all tests
249
+ pytest tests/ -v
250
+
251
+ # Run with coverage
252
+ pytest tests/ --cov=gibsondedup --cov-report=term-missing
253
+ ```
254
+
255
+ **Test coverage: 59 tests across 6 modules**
256
+
257
+ ```
258
+ tests/test_normalizer.py 14 tests (URL normalization)
259
+ tests/test_grouping.py 4 tests (exact grouping)
260
+ tests/test_canonicalizer.py 7 tests (best result selection)
261
+ tests/test_merger.py 11 tests (semantic merging)
262
+ tests/test_similarity.py 13 tests (Jaccard similarity)
263
+ tests/test_pipeline.py 10 tests (end-to-end pipeline)
264
+ ```
265
+
266
+ ---
267
+
268
+ ## Roadmap
269
+
270
+ ### Phase 1 (Complete)
271
+ - URL normalization
272
+ - Exact duplicate grouping
273
+ - Canonicalization with source traceability
274
+
275
+ ### Phase 2 (Complete)
276
+ - Jaccard similarity engine
277
+ - Domain-based semantic merging
278
+ - Configurable similarity threshold
279
+
280
+ ### Phase 3 (Planned)
281
+ - Persistent caching (Redis)
282
+ - Database storage (PostgreSQL)
283
+ - REST API exposure
284
+ - Rails wrapper (gem)
285
+
286
+ ### Phase 4 (Planned)
287
+ - Semantic embeddings (replace Jaccard with vector similarity)
288
+ - Distributed processing
289
+ - Production observability
290
+
291
+ ---
292
+
293
+ ## License
294
+
295
+ MIT License. See `LICENSE` file.
296
+
297
+ ---
298
+
299
+ ## Author
300
+
301
+ **Aseda Gibson**
302
+ Computer Engineering, University of Energy and Natural Resources (UENR), Ghana.
303
+ Backend systems, distributed architecture, infrastructure tooling.
304
+
305
+ GitHub: [github.com/asedagibson](https://github.com/asedagibson)
306
+
307
+ ---
308
+
309
+ *Built with correctness-oriented engineering. Every architectural decision is documented. Every invariant is tested.*
File without changes
File without changes
File without changes
@@ -0,0 +1,24 @@
1
+ from gibsondedup.contracts.models import NormalizedSearchResult, CanonicalResult
2
+ from typing import List, Dict
3
+
4
def canonicalize(groups: Dict[str, List[NormalizedSearchResult]]) -> List[CanonicalResult]:
    """Collapse every duplicate group into a single ``CanonicalResult``.

    Args:
        groups: Mapping whose values are lists of results considered
            duplicates of one another. The keys are not read here; the
            emitted ``canonical_url`` is taken from the chosen result.

    Returns:
        One ``CanonicalResult`` per non-empty group, in the mapping's
        iteration order. Empty groups are skipped.
    """

    def _text_length(result: NormalizedSearchResult) -> int:
        # A missing (None) title or description counts as empty text.
        return len(result.description or "") + len(result.title or "")

    def _collapse(members: List[NormalizedSearchResult]) -> CanonicalResult:
        # The representative is the member with the greatest combined
        # description + title length; max() keeps the first one on a tie.
        best = max(members, key=_text_length)
        return CanonicalResult(
            title=best.title,
            url=best.url,
            canonical_url=best.canonical_url,
            description=best.description,
            # Traceability: one source entry per member, in group order.
            sources=[member.source for member in members],
            # Every member except the representative counts as removed.
            duplicates_removed=len(members) - 1,
        )

    return [_collapse(members) for members in groups.values() if members]
21
+
22
+ __all__ = [
23
+ "canonicalize"
24
+ ]
File without changes
File without changes
@@ -0,0 +1,32 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
@dataclass
class RawSearchResult:
    """A search result as supplied by the caller, before normalization."""

    # Mandatory fields: no default, so they must be passed at construction.
    title: str
    url: str
    # Optional fields fall back to these defaults when omitted.
    description: Optional[str] = ""
    source: Optional[str] = "unknown"
10
+
11
@dataclass
class NormalizedSearchResult:
    """A search result paired with the normalized form of its URL."""

    title: str
    # The URL exactly as it was received.
    url: str
    # The normalized URL for the same result.
    canonical_url: str
    description: Optional[str] = ""
    source: Optional[str] = "unknown"
18
+
19
@dataclass
class CanonicalResult:
    """The single result emitted for one group of duplicates."""

    title: str
    url: str
    canonical_url: str
    description: Optional[str] = ""
    # default_factory gives every instance its own list rather than a
    # single list shared between instances.
    sources: list = field(default_factory=list)
    # Defaults to 0 when not supplied.
    duplicates_removed: int = 0
27
+
28
+ __all__ = [
29
+ "RawSearchResult",
30
+ "NormalizedSearchResult",
31
+ "CanonicalResult"
32
+ ]
File without changes
File without changes