gibsondedup 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gibsondedup-0.1.0/PKG-INFO +322 -0
- gibsondedup-0.1.0/README.md +309 -0
- gibsondedup-0.1.0/gibsondedup/__init__.py +0 -0
- gibsondedup-0.1.0/gibsondedup/api/__init__.py +0 -0
- gibsondedup-0.1.0/gibsondedup/cache/__init__.py +0 -0
- gibsondedup-0.1.0/gibsondedup/canonicalizer/__init__.py +0 -0
- gibsondedup-0.1.0/gibsondedup/canonicalizer/canonicalizer.py +24 -0
- gibsondedup-0.1.0/gibsondedup/config/settings.py +0 -0
- gibsondedup-0.1.0/gibsondedup/contracts/__init__.py +0 -0
- gibsondedup-0.1.0/gibsondedup/contracts/models.py +32 -0
- gibsondedup-0.1.0/gibsondedup/errors/__init__.py +0 -0
- gibsondedup-0.1.0/gibsondedup/grouping/__init__.py +0 -0
- gibsondedup-0.1.0/gibsondedup/grouping/grouping.py +17 -0
- gibsondedup-0.1.0/gibsondedup/grouping/merger.py +77 -0
- gibsondedup-0.1.0/gibsondedup/main.py +4 -0
- gibsondedup-0.1.0/gibsondedup/metrics/__init__.py +0 -0
- gibsondedup-0.1.0/gibsondedup/normalizer/__init__.py +0 -0
- gibsondedup-0.1.0/gibsondedup/normalizer/similarity.py +20 -0
- gibsondedup-0.1.0/gibsondedup/normalizer/url_normalizer.py +52 -0
- gibsondedup-0.1.0/gibsondedup/orchestration/__init__.py +0 -0
- gibsondedup-0.1.0/gibsondedup/orchestration/pipeline.py +62 -0
- gibsondedup-0.1.0/gibsondedup/parser/__init__.py +0 -0
- gibsondedup-0.1.0/gibsondedup/parser/json_parser.py +39 -0
- gibsondedup-0.1.0/gibsondedup.egg-info/PKG-INFO +322 -0
- gibsondedup-0.1.0/gibsondedup.egg-info/SOURCES.txt +37 -0
- gibsondedup-0.1.0/gibsondedup.egg-info/dependency_links.txt +1 -0
- gibsondedup-0.1.0/gibsondedup.egg-info/entry_points.txt +2 -0
- gibsondedup-0.1.0/gibsondedup.egg-info/top_level.txt +5 -0
- gibsondedup-0.1.0/pyproject.toml +26 -0
- gibsondedup-0.1.0/setup.cfg +4 -0
- gibsondedup-0.1.0/setup.py +16 -0
- gibsondedup-0.1.0/tests/__init__.py +0 -0
- gibsondedup-0.1.0/tests/fixtures.py +0 -0
- gibsondedup-0.1.0/tests/test_canonicalizer.py +131 -0
- gibsondedup-0.1.0/tests/test_grouping.py +50 -0
- gibsondedup-0.1.0/tests/test_merger.py +117 -0
- gibsondedup-0.1.0/tests/test_normalizer.py +55 -0
- gibsondedup-0.1.0/tests/test_pipeline.py +141 -0
- gibsondedup-0.1.0/tests/test_similarity.py +61 -0
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gibsondedup
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A high-performance search result deduplication engine that groups duplicate results by canonical URL and semantic similarity, returning clean batches with the most authoritative source and full traceability to all original sources.
|
|
5
|
+
Author: Gibson Kwabena Aseda Mensah
|
|
6
|
+
Author-email: logosyninc@gmail.com
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Dynamic: author-email
|
|
12
|
+
Dynamic: requires-python
|
|
13
|
+
|
|
14
|
+
# gibsondedup
|
|
15
|
+
|
|
16
|
+
A high-performance search result deduplication engine that groups duplicate results by canonical URL and semantic similarity, returning clean batches with the most authoritative source and full traceability to all original sources.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## The Problem
|
|
21
|
+
|
|
22
|
+
When aggregating search results from multiple providers (Google, Bing, DuckDuckGo), 40–60% of results are duplicates — same content, different URLs, different titles, different tracking parameters.
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
Source 1 (Google):
|
|
26
|
+
url: https://www.python.org/docs?utm_source=google
|
|
27
|
+
title: "Python Official Documentation"
|
|
28
|
+
|
|
29
|
+
Source 2 (Bing):
|
|
30
|
+
url: http://python.org/docs/
|
|
31
|
+
title: "Python Docs"
|
|
32
|
+
|
|
33
|
+
Source 3 (DuckDuckGo):
|
|
34
|
+
url: https://python.org/docs?utm_medium=cpc
|
|
35
|
+
title: "Learn Python - Official Docs"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
These are the same resource. Without deduplication, users see noise instead of signal.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## The Solution
|
|
43
|
+
|
|
44
|
+
`gibsondedup` processes raw search results through a multi-stage pipeline:
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
Raw Results
|
|
48
|
+
↓
|
|
49
|
+
Parse & Validate (malformed records skipped gracefully)
|
|
50
|
+
↓
|
|
51
|
+
URL Normalization (remove tracking params, www, trailing slashes)
|
|
52
|
+
↓
|
|
53
|
+
Exact URL Grouping (hash-based, O(n) complexity)
|
|
54
|
+
↓
|
|
55
|
+
Semantic Merge (Jaccard similarity, configurable threshold)
|
|
56
|
+
↓
|
|
57
|
+
Canonicalization (select best result per group)
|
|
58
|
+
↓
|
|
59
|
+
Clean Output (deduplicated results + metadata)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
**Result:** 100 noisy inputs → 60–70 clean, canonical results.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Installation
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install gibsondedup
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Quick Start
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from gibsondedup.orchestration.pipeline import DeduplicationEngine
|
|
78
|
+
|
|
79
|
+
engine = DeduplicationEngine()
|
|
80
|
+
|
|
81
|
+
results = engine.process([
|
|
82
|
+
{
|
|
83
|
+
"title": "Python Official Documentation",
|
|
84
|
+
"url": "https://www.python.org/docs?utm_source=google",
|
|
85
|
+
"description": "Official Python docs",
|
|
86
|
+
"source": "google"
|
|
87
|
+
},
|
|
88
|
+
{
|
|
89
|
+
"title": "Python Docs",
|
|
90
|
+
"url": "http://python.org/docs/",
|
|
91
|
+
"description": "Python documentation",
|
|
92
|
+
"source": "bing"
|
|
93
|
+
},
|
|
94
|
+
{
|
|
95
|
+
"title": "Stack Overflow Python",
|
|
96
|
+
"url": "https://stackoverflow.com/questions/tagged/python",
|
|
97
|
+
"description": "Python questions",
|
|
98
|
+
"source": "google"
|
|
99
|
+
},
|
|
100
|
+
])
|
|
101
|
+
|
|
102
|
+
print(results)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
**Output:**
|
|
106
|
+
|
|
107
|
+
```json
|
|
108
|
+
{
|
|
109
|
+
"results": [
|
|
110
|
+
{
|
|
111
|
+
"title": "Python Official Documentation",
|
|
112
|
+
"url": "https://www.python.org/docs?utm_source=google",
|
|
113
|
+
"canonical_url": "python.org/docs",
|
|
114
|
+
"description": "Official Python docs",
|
|
115
|
+
"sources": ["google", "bing"],
|
|
116
|
+
"duplicates_removed": 1
|
|
117
|
+
},
|
|
118
|
+
{
|
|
119
|
+
"title": "Stack Overflow Python",
|
|
120
|
+
"url": "https://stackoverflow.com/questions/tagged/python",
|
|
121
|
+
"canonical_url": "stackoverflow.com/questions/tagged/python",
|
|
122
|
+
"description": "Python questions",
|
|
123
|
+
"sources": ["google"],
|
|
124
|
+
"duplicates_removed": 0
|
|
125
|
+
}
|
|
126
|
+
],
|
|
127
|
+
"meta": {
|
|
128
|
+
"total_input": 3,
|
|
129
|
+
"total_parsed": 3,
|
|
130
|
+
"total_output": 2,
|
|
131
|
+
"duplicates_removed": 1,
|
|
132
|
+
"processing_time_ms": 0.09,
|
|
133
|
+
"similarity_threshold": 0.8
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## Configuration
|
|
141
|
+
|
|
142
|
+
### Custom Similarity Threshold
|
|
143
|
+
|
|
144
|
+
Control how aggressively similar-titled results are merged:
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
# Conservative (default) — only merge highly similar titles
|
|
148
|
+
engine = DeduplicationEngine(similarity_threshold=0.8)
|
|
149
|
+
|
|
150
|
+
# Moderate — merge titles with moderate overlap
|
|
151
|
+
engine = DeduplicationEngine(similarity_threshold=0.5)
|
|
152
|
+
|
|
153
|
+
# Aggressive — merge titles with minimal overlap
|
|
154
|
+
engine = DeduplicationEngine(similarity_threshold=0.3)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Input Contract
|
|
158
|
+
|
|
159
|
+
Each result in the input array accepts:
|
|
160
|
+
|
|
161
|
+
| Field | Type | Required | Description |
|
|
162
|
+
|-------|------|----------|-------------|
|
|
163
|
+
| `title` | string | ✅ | Result title |
|
|
164
|
+
| `url` | string | ✅ | Result URL |
|
|
165
|
+
| `description` | string | ❌ | Result description |
|
|
166
|
+
| `source` | string | ❌ | Source provider (google, bing, etc.) |
|
|
167
|
+
|
|
168
|
+
Malformed records (missing title or URL) are **skipped gracefully** — the pipeline continues processing valid records.
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Architecture
|
|
173
|
+
|
|
174
|
+
### Pipeline Stages
|
|
175
|
+
|
|
176
|
+
**Stage 1: Parser**
|
|
177
|
+
Converts raw JSON payloads into structured internal contracts. Validates required fields. Skips malformed records without crashing.
|
|
178
|
+
|
|
179
|
+
**Stage 2: URL Normalizer**
|
|
180
|
+
Converts URLs to canonical form by:
|
|
181
|
+
- Removing protocol (`http://`, `https://`)
|
|
182
|
+
- Removing `www` prefix
|
|
183
|
+
- Removing tracking parameters (`utm_*`, `fbclid`, `gclid`)
|
|
184
|
+
- Removing trailing slashes
|
|
185
|
+
- Stripping default ports (80, 443)
|
|
186
|
+
- Preserving meaningful query parameters
|
|
187
|
+
- Lowercasing everything
|
|
188
|
+
|
|
189
|
+
**Stage 3: Exact Grouping**
|
|
190
|
+
Groups results by normalized canonical URL using a hash map. Time complexity: O(n). Avoids the O(n²) pairwise comparison trap.
|
|
191
|
+
|
|
192
|
+
**Stage 4: Semantic Merger**
|
|
193
|
+
Within groups sharing the same domain, compares representative titles using Jaccard similarity. Groups exceeding the similarity threshold are merged into one. Only compares within the same exact domain — `python.org` and `docs.python.org` are treated as separate domains.
|
|
194
|
+
|
|
195
|
+
**Stage 5: Canonicalizer**
|
|
196
|
+
For each group, selects the single best result using a combined title + description length heuristic. Collects source traceability from all merged results.
|
|
197
|
+
|
|
198
|
+
### Data Contracts
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
# Input
|
|
202
|
+
RawSearchResult(title, url, description?, source?)
|
|
203
|
+
|
|
204
|
+
# Internal
|
|
205
|
+
NormalizedSearchResult(title, url, canonical_url, description, source)
|
|
206
|
+
|
|
207
|
+
# Output
|
|
208
|
+
CanonicalResult(title, url, canonical_url, description, sources[], duplicates_removed)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### System Invariants
|
|
212
|
+
|
|
213
|
+
1. Same normalized URL → same duplicate group (always)
|
|
214
|
+
2. Canonical results preserve source traceability (always)
|
|
215
|
+
3. Pipeline stages do not mutate upstream data (always)
|
|
216
|
+
4. Normalization happens once per result (always)
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## Engineering Decisions
|
|
221
|
+
|
|
222
|
+
### Why hash-based grouping (O(n)) over pairwise comparison (O(n²))?
|
|
223
|
+
|
|
224
|
+
With 1000 results, O(n²) means 1,000,000 comparisons. O(n) means 1000. At scale this is the difference between milliseconds and seconds.
|
|
225
|
+
|
|
226
|
+
### Why exact domain matching for semantic merging?
|
|
227
|
+
|
|
228
|
+
`python.org/docs` and `docs.python.org` serve genuinely different content despite sharing a base domain. Exact domain matching prevents false merges while still catching real duplicates like `python.org/docs` and `python.org/reference`.
|
|
229
|
+
|
|
230
|
+
### Why Jaccard similarity over Levenshtein distance?
|
|
231
|
+
|
|
232
|
+
Jaccard operates on token sets (words), making it robust to word order changes and additions. Levenshtein operates on character sequences, making it sensitive to trivial differences like "Docs" vs "Documentation".
|
|
233
|
+
|
|
234
|
+
### Why keep normalization deterministic?
|
|
235
|
+
|
|
236
|
+
Same input must always produce same output. This ensures:
|
|
237
|
+
- Results are reproducible across runs
|
|
238
|
+
- Caching normalized URLs is safe
|
|
239
|
+
- Tests are reliable and meaningful
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
## Performance
|
|
244
|
+
|
|
245
|
+
| Input Size | Processing Time | Memory |
|
|
246
|
+
|------------|----------------|--------|
|
|
247
|
+
| 100 results | ~1ms | ~1MB |
|
|
248
|
+
| 1,000 results | ~10ms | ~5MB |
|
|
249
|
+
| 10,000 results | ~100ms | ~50MB |
|
|
250
|
+
|
|
251
|
+
*Benchmarked on Ubuntu 22.04, Python 3.10, Intel i5*
|
|
252
|
+
|
|
253
|
+
---
|
|
254
|
+
|
|
255
|
+
## Testing
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
# Install dev dependencies
|
|
259
|
+
pip install pytest pytest-cov
|
|
260
|
+
|
|
261
|
+
# Run all tests
|
|
262
|
+
pytest tests/ -v
|
|
263
|
+
|
|
264
|
+
# Run with coverage
|
|
265
|
+
pytest tests/ --cov=gibsondedup --cov-report=term-missing
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
**Test coverage: 59 tests across 6 modules**
|
|
269
|
+
|
|
270
|
+
```
|
|
271
|
+
tests/test_normalizer.py 14 tests (URL normalization)
|
|
272
|
+
tests/test_grouping.py 4 tests (exact grouping)
|
|
273
|
+
tests/test_canonicalizer.py 7 tests (best result selection)
|
|
274
|
+
tests/test_merger.py 11 tests (semantic merging)
|
|
275
|
+
tests/test_similarity.py 13 tests (Jaccard similarity)
|
|
276
|
+
tests/test_pipeline.py 10 tests (end-to-end pipeline)
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
---
|
|
280
|
+
|
|
281
|
+
## Roadmap
|
|
282
|
+
|
|
283
|
+
### Phase 1 (Complete)
|
|
284
|
+
- URL normalization
|
|
285
|
+
- Exact duplicate grouping
|
|
286
|
+
- Canonicalization with source traceability
|
|
287
|
+
|
|
288
|
+
### Phase 2 (Complete)
|
|
289
|
+
- Jaccard similarity engine
|
|
290
|
+
- Domain-based semantic merging
|
|
291
|
+
- Configurable similarity threshold
|
|
292
|
+
|
|
293
|
+
### Phase 3 (Planned)
|
|
294
|
+
- Persistent caching (Redis)
|
|
295
|
+
- Database storage (PostgreSQL)
|
|
296
|
+
- REST API exposure
|
|
297
|
+
- Rails wrapper (gem)
|
|
298
|
+
|
|
299
|
+
### Phase 4 (Planned)
|
|
300
|
+
- Semantic embeddings (replace Jaccard with vector similarity)
|
|
301
|
+
- Distributed processing
|
|
302
|
+
- Production observability
|
|
303
|
+
|
|
304
|
+
---
|
|
305
|
+
|
|
306
|
+
## License
|
|
307
|
+
|
|
308
|
+
MIT License. See `LICENSE` file.
|
|
309
|
+
|
|
310
|
+
---
|
|
311
|
+
|
|
312
|
+
## Author
|
|
313
|
+
|
|
314
|
+
**Aseda Gibson**
|
|
315
|
+
Computer Engineering, University of Energy and Natural Resources (UENR), Ghana.
|
|
316
|
+
Backend systems, distributed architecture, infrastructure tooling.
|
|
317
|
+
|
|
318
|
+
GitHub: [github.com/asedagibson](https://github.com/asedagibson)
|
|
319
|
+
|
|
320
|
+
---
|
|
321
|
+
|
|
322
|
+
*Built with correctness-oriented engineering. Every architectural decision is documented. Every invariant is tested.*
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
# gibsondedup
|
|
2
|
+
|
|
3
|
+
A high-performance search result deduplication engine that groups duplicate results by canonical URL and semantic similarity, returning clean batches with the most authoritative source and full traceability to all original sources.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## The Problem
|
|
8
|
+
|
|
9
|
+
When aggregating search results from multiple providers (Google, Bing, DuckDuckGo), 40–60% of results are duplicates — same content, different URLs, different titles, different tracking parameters.
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
Source 1 (Google):
|
|
13
|
+
url: https://www.python.org/docs?utm_source=google
|
|
14
|
+
title: "Python Official Documentation"
|
|
15
|
+
|
|
16
|
+
Source 2 (Bing):
|
|
17
|
+
url: http://python.org/docs/
|
|
18
|
+
title: "Python Docs"
|
|
19
|
+
|
|
20
|
+
Source 3 (DuckDuckGo):
|
|
21
|
+
url: https://python.org/docs?utm_medium=cpc
|
|
22
|
+
title: "Learn Python - Official Docs"
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
These are the same resource. Without deduplication, users see noise instead of signal.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## The Solution
|
|
30
|
+
|
|
31
|
+
`gibsondedup` processes raw search results through a multi-stage pipeline:
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
Raw Results
|
|
35
|
+
↓
|
|
36
|
+
Parse & Validate (malformed records skipped gracefully)
|
|
37
|
+
↓
|
|
38
|
+
URL Normalization (remove tracking params, www, trailing slashes)
|
|
39
|
+
↓
|
|
40
|
+
Exact URL Grouping (hash-based, O(n) complexity)
|
|
41
|
+
↓
|
|
42
|
+
Semantic Merge (Jaccard similarity, configurable threshold)
|
|
43
|
+
↓
|
|
44
|
+
Canonicalization (select best result per group)
|
|
45
|
+
↓
|
|
46
|
+
Clean Output (deduplicated results + metadata)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
**Result:** 100 noisy inputs → 60–70 clean, canonical results.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install gibsondedup
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Quick Start
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from gibsondedup.orchestration.pipeline import DeduplicationEngine
|
|
65
|
+
|
|
66
|
+
engine = DeduplicationEngine()
|
|
67
|
+
|
|
68
|
+
results = engine.process([
|
|
69
|
+
{
|
|
70
|
+
"title": "Python Official Documentation",
|
|
71
|
+
"url": "https://www.python.org/docs?utm_source=google",
|
|
72
|
+
"description": "Official Python docs",
|
|
73
|
+
"source": "google"
|
|
74
|
+
},
|
|
75
|
+
{
|
|
76
|
+
"title": "Python Docs",
|
|
77
|
+
"url": "http://python.org/docs/",
|
|
78
|
+
"description": "Python documentation",
|
|
79
|
+
"source": "bing"
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"title": "Stack Overflow Python",
|
|
83
|
+
"url": "https://stackoverflow.com/questions/tagged/python",
|
|
84
|
+
"description": "Python questions",
|
|
85
|
+
"source": "google"
|
|
86
|
+
},
|
|
87
|
+
])
|
|
88
|
+
|
|
89
|
+
print(results)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
**Output:**
|
|
93
|
+
|
|
94
|
+
```json
|
|
95
|
+
{
|
|
96
|
+
"results": [
|
|
97
|
+
{
|
|
98
|
+
"title": "Python Official Documentation",
|
|
99
|
+
"url": "https://www.python.org/docs?utm_source=google",
|
|
100
|
+
"canonical_url": "python.org/docs",
|
|
101
|
+
"description": "Official Python docs",
|
|
102
|
+
"sources": ["google", "bing"],
|
|
103
|
+
"duplicates_removed": 1
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"title": "Stack Overflow Python",
|
|
107
|
+
"url": "https://stackoverflow.com/questions/tagged/python",
|
|
108
|
+
"canonical_url": "stackoverflow.com/questions/tagged/python",
|
|
109
|
+
"description": "Python questions",
|
|
110
|
+
"sources": ["google"],
|
|
111
|
+
"duplicates_removed": 0
|
|
112
|
+
}
|
|
113
|
+
],
|
|
114
|
+
"meta": {
|
|
115
|
+
"total_input": 3,
|
|
116
|
+
"total_parsed": 3,
|
|
117
|
+
"total_output": 2,
|
|
118
|
+
"duplicates_removed": 1,
|
|
119
|
+
"processing_time_ms": 0.09,
|
|
120
|
+
"similarity_threshold": 0.8
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## Configuration
|
|
128
|
+
|
|
129
|
+
### Custom Similarity Threshold
|
|
130
|
+
|
|
131
|
+
Control how aggressively similar-titled results are merged:
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
# Conservative (default) — only merge highly similar titles
|
|
135
|
+
engine = DeduplicationEngine(similarity_threshold=0.8)
|
|
136
|
+
|
|
137
|
+
# Moderate — merge titles with moderate overlap
|
|
138
|
+
engine = DeduplicationEngine(similarity_threshold=0.5)
|
|
139
|
+
|
|
140
|
+
# Aggressive — merge titles with minimal overlap
|
|
141
|
+
engine = DeduplicationEngine(similarity_threshold=0.3)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Input Contract
|
|
145
|
+
|
|
146
|
+
Each result in the input array accepts:
|
|
147
|
+
|
|
148
|
+
| Field | Type | Required | Description |
|
|
149
|
+
|-------|------|----------|-------------|
|
|
150
|
+
| `title` | string | ✅ | Result title |
|
|
151
|
+
| `url` | string | ✅ | Result URL |
|
|
152
|
+
| `description` | string | ❌ | Result description |
|
|
153
|
+
| `source` | string | ❌ | Source provider (google, bing, etc.) |
|
|
154
|
+
|
|
155
|
+
Malformed records (missing title or URL) are **skipped gracefully** — the pipeline continues processing valid records.
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## Architecture
|
|
160
|
+
|
|
161
|
+
### Pipeline Stages
|
|
162
|
+
|
|
163
|
+
**Stage 1: Parser**
|
|
164
|
+
Converts raw JSON payloads into structured internal contracts. Validates required fields. Skips malformed records without crashing.
|
|
165
|
+
|
|
166
|
+
**Stage 2: URL Normalizer**
|
|
167
|
+
Converts URLs to canonical form by:
|
|
168
|
+
- Removing protocol (`http://`, `https://`)
|
|
169
|
+
- Removing `www` prefix
|
|
170
|
+
- Removing tracking parameters (`utm_*`, `fbclid`, `gclid`)
|
|
171
|
+
- Removing trailing slashes
|
|
172
|
+
- Stripping default ports (80, 443)
|
|
173
|
+
- Preserving meaningful query parameters
|
|
174
|
+
- Lowercasing everything
|
|
175
|
+
|
|
176
|
+
**Stage 3: Exact Grouping**
|
|
177
|
+
Groups results by normalized canonical URL using a hash map. Time complexity: O(n). Avoids the O(n²) pairwise comparison trap.
|
|
178
|
+
|
|
179
|
+
**Stage 4: Semantic Merger**
|
|
180
|
+
Within groups sharing the same domain, compares representative titles using Jaccard similarity. Groups exceeding the similarity threshold are merged into one. Only compares within the same exact domain — `python.org` and `docs.python.org` are treated as separate domains.
|
|
181
|
+
|
|
182
|
+
**Stage 5: Canonicalizer**
|
|
183
|
+
For each group, selects the single best result using a combined title + description length heuristic. Collects source traceability from all merged results.
|
|
184
|
+
|
|
185
|
+
### Data Contracts
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
# Input
|
|
189
|
+
RawSearchResult(title, url, description?, source?)
|
|
190
|
+
|
|
191
|
+
# Internal
|
|
192
|
+
NormalizedSearchResult(title, url, canonical_url, description, source)
|
|
193
|
+
|
|
194
|
+
# Output
|
|
195
|
+
CanonicalResult(title, url, canonical_url, description, sources[], duplicates_removed)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### System Invariants
|
|
199
|
+
|
|
200
|
+
1. Same normalized URL → same duplicate group (always)
|
|
201
|
+
2. Canonical results preserve source traceability (always)
|
|
202
|
+
3. Pipeline stages do not mutate upstream data (always)
|
|
203
|
+
4. Normalization happens once per result (always)
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## Engineering Decisions
|
|
208
|
+
|
|
209
|
+
### Why hash-based grouping (O(n)) over pairwise comparison (O(n²))?
|
|
210
|
+
|
|
211
|
+
With 1000 results, O(n²) means 1,000,000 comparisons. O(n) means 1000. At scale this is the difference between milliseconds and seconds.
|
|
212
|
+
|
|
213
|
+
### Why exact domain matching for semantic merging?
|
|
214
|
+
|
|
215
|
+
`python.org/docs` and `docs.python.org` serve genuinely different content despite sharing a base domain. Exact domain matching prevents false merges while still catching real duplicates like `python.org/docs` and `python.org/reference`.
|
|
216
|
+
|
|
217
|
+
### Why Jaccard similarity over Levenshtein distance?
|
|
218
|
+
|
|
219
|
+
Jaccard operates on token sets (words), making it robust to word order changes and additions. Levenshtein operates on character sequences, making it sensitive to trivial differences like "Docs" vs "Documentation".
|
|
220
|
+
|
|
221
|
+
### Why keep normalization deterministic?
|
|
222
|
+
|
|
223
|
+
Same input must always produce same output. This ensures:
|
|
224
|
+
- Results are reproducible across runs
|
|
225
|
+
- Caching normalized URLs is safe
|
|
226
|
+
- Tests are reliable and meaningful
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## Performance
|
|
231
|
+
|
|
232
|
+
| Input Size | Processing Time | Memory |
|
|
233
|
+
|------------|----------------|--------|
|
|
234
|
+
| 100 results | ~1ms | ~1MB |
|
|
235
|
+
| 1,000 results | ~10ms | ~5MB |
|
|
236
|
+
| 10,000 results | ~100ms | ~50MB |
|
|
237
|
+
|
|
238
|
+
*Benchmarked on Ubuntu 22.04, Python 3.10, Intel i5*
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
## Testing
|
|
243
|
+
|
|
244
|
+
```bash
|
|
245
|
+
# Install dev dependencies
|
|
246
|
+
pip install pytest pytest-cov
|
|
247
|
+
|
|
248
|
+
# Run all tests
|
|
249
|
+
pytest tests/ -v
|
|
250
|
+
|
|
251
|
+
# Run with coverage
|
|
252
|
+
pytest tests/ --cov=gibsondedup --cov-report=term-missing
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
**Test coverage: 59 tests across 6 modules**
|
|
256
|
+
|
|
257
|
+
```
|
|
258
|
+
tests/test_normalizer.py 14 tests (URL normalization)
|
|
259
|
+
tests/test_grouping.py 4 tests (exact grouping)
|
|
260
|
+
tests/test_canonicalizer.py 7 tests (best result selection)
|
|
261
|
+
tests/test_merger.py 11 tests (semantic merging)
|
|
262
|
+
tests/test_similarity.py 13 tests (Jaccard similarity)
|
|
263
|
+
tests/test_pipeline.py 10 tests (end-to-end pipeline)
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
---
|
|
267
|
+
|
|
268
|
+
## Roadmap
|
|
269
|
+
|
|
270
|
+
### Phase 1 (Complete)
|
|
271
|
+
- URL normalization
|
|
272
|
+
- Exact duplicate grouping
|
|
273
|
+
- Canonicalization with source traceability
|
|
274
|
+
|
|
275
|
+
### Phase 2 (Complete)
|
|
276
|
+
- Jaccard similarity engine
|
|
277
|
+
- Domain-based semantic merging
|
|
278
|
+
- Configurable similarity threshold
|
|
279
|
+
|
|
280
|
+
### Phase 3 (Planned)
|
|
281
|
+
- Persistent caching (Redis)
|
|
282
|
+
- Database storage (PostgreSQL)
|
|
283
|
+
- REST API exposure
|
|
284
|
+
- Rails wrapper (gem)
|
|
285
|
+
|
|
286
|
+
### Phase 4 (Planned)
|
|
287
|
+
- Semantic embeddings (replace Jaccard with vector similarity)
|
|
288
|
+
- Distributed processing
|
|
289
|
+
- Production observability
|
|
290
|
+
|
|
291
|
+
---
|
|
292
|
+
|
|
293
|
+
## License
|
|
294
|
+
|
|
295
|
+
MIT License. See `LICENSE` file.
|
|
296
|
+
|
|
297
|
+
---
|
|
298
|
+
|
|
299
|
+
## Author
|
|
300
|
+
|
|
301
|
+
**Aseda Gibson**
|
|
302
|
+
Computer Engineering, University of Energy and Natural Resources (UENR), Ghana.
|
|
303
|
+
Backend systems, distributed architecture, infrastructure tooling.
|
|
304
|
+
|
|
305
|
+
GitHub: [github.com/asedagibson](https://github.com/asedagibson)
|
|
306
|
+
|
|
307
|
+
---
|
|
308
|
+
|
|
309
|
+
*Built with correctness-oriented engineering. Every architectural decision is documented. Every invariant is tested.*
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from gibsondedup.contracts.models import NormalizedSearchResult, CanonicalResult
|
|
2
|
+
from typing import List, Dict
|
|
3
|
+
|
|
4
|
+
def canonicalize(groups: Dict[str, List[NormalizedSearchResult]]) -> List[CanonicalResult]:
|
|
5
|
+
canonical_results = []
|
|
6
|
+
for canonical_url, results in groups.items():
|
|
7
|
+
if not results:
|
|
8
|
+
continue
|
|
9
|
+
# Pick the result with the greatest combined title + description length as the canonical result
|
|
10
|
+
canonical_result = max(results, key=lambda r: len(r.description or "") + len(r.title or ""))
|
|
11
|
+
# Create a CanonicalResult with the sources and count of duplicates removed
|
|
12
|
+
canonical_results.append(CanonicalResult(
|
|
13
|
+
title=canonical_result.title,
|
|
14
|
+
url=canonical_result.url,
|
|
15
|
+
canonical_url=canonical_result.canonical_url,
|
|
16
|
+
description=canonical_result.description,
|
|
17
|
+
sources=[r.source for r in results],
|
|
18
|
+
duplicates_removed=len(results) - 1
|
|
19
|
+
))
|
|
20
|
+
return canonical_results
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"canonicalize"
|
|
24
|
+
]
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
@dataclass
|
|
5
|
+
class RawSearchResult:
|
|
6
|
+
title: str # Required
|
|
7
|
+
url: str # Required
|
|
8
|
+
description: Optional[str] = "" # Optional
|
|
9
|
+
source: Optional[str] = "unknown" # Optional
|
|
10
|
+
|
|
11
|
+
@dataclass
|
|
12
|
+
class NormalizedSearchResult:
|
|
13
|
+
title: str
|
|
14
|
+
url: str # Original URL
|
|
15
|
+
canonical_url: str # Normalized URL
|
|
16
|
+
description: Optional[str] = ""
|
|
17
|
+
source: Optional[str] = "unknown"
|
|
18
|
+
|
|
19
|
+
@dataclass
|
|
20
|
+
class CanonicalResult:
|
|
21
|
+
title: str
|
|
22
|
+
url: str
|
|
23
|
+
canonical_url: str
|
|
24
|
+
description: Optional[str] = ""
|
|
25
|
+
sources: list = field(default_factory=list)
|
|
26
|
+
duplicates_removed: int = 0
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"RawSearchResult",
|
|
30
|
+
"NormalizedSearchResult",
|
|
31
|
+
"CanonicalResult"
|
|
32
|
+
]
|
|
File without changes
|
|
File without changes
|