nia_sync-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- auth.py +168 -0
- config.py +276 -0
- extractor.py +947 -0
- main.py +632 -0
- nia_sync-0.1.0.dist-info/METADATA +9 -0
- nia_sync-0.1.0.dist-info/RECORD +11 -0
- nia_sync-0.1.0.dist-info/WHEEL +5 -0
- nia_sync-0.1.0.dist-info/entry_points.txt +2 -0
- nia_sync-0.1.0.dist-info/top_level.txt +6 -0
- sync.py +192 -0
- watcher.py +304 -0
extractor.py
ADDED
@@ -0,0 +1,947 @@
"""
Database and folder extractor for Nia Local Sync CLI.

Extracts text content from SQLite databases and folders,
converting them into virtual "files" for indexing.

Supported types:
- iMessage (~/Library/Messages/chat.db)
- Safari History (~/Library/Safari/History.db)
- Chrome/Brave/Edge History
- Firefox History (places.sqlite)
- Telegram (JSON export)
- Regular folders
"""
import os
import re
import json
import sqlite3
import zipfile
import logging
from datetime import datetime, timezone
from typing import Any
from pathlib import Path

logger = logging.getLogger(__name__)

# =============================================================================
# EXCLUSION PATTERNS - Synced with backend/utils/exclusion_patterns.py
# =============================================================================

# Directories to skip entirely (prevents os.walk from descending)
SKIP_DIRS = {
    # VCS
    ".git", ".svn", ".hg", ".bzr",
    # Node/JS
    "node_modules", ".npm", ".pnpm-store", ".yarn", "bower_components",
    ".next", ".nuxt", ".output", ".svelte-kit", ".parcel-cache", ".cache", ".turbo",
    # Python
    "__pycache__", "venv", ".venv", "env", ".tox", ".nox",
    ".pytest_cache", ".mypy_cache", ".ruff_cache", ".hypothesis", "htmlcov", ".Python",
    # JVM
    "target", ".gradle", ".m2",
    # Rust
    "target",
    # Go
    "vendor",
    # Ruby
    ".bundle",
    # .NET
    "bin", "obj", "packages",
    # iOS/macOS
    "DerivedData", "Pods", ".build",
    # Build outputs
    "dist", "build", "out", "output", "release", "debug", "coverage", ".nyc_output",
    # IDE
    ".idea", ".vscode", ".atom",
    # OS
    ".Spotlight-V100", ".Trashes",
    # Misc
    ".terraform", ".vagrant", ".docker", ".kube",
    "logs", "log", "tmp", "temp",
    ".aws", ".ssh",
}

# File extensions to skip (from backend exclusion_patterns.py)
SKIP_EXTENSIONS = {
    # Security - keys/certs
    ".pem", ".key", ".p12", ".pfx", ".crt", ".cer",
    # Python compiled
    ".pyc", ".pyo", ".pyd", ".egg",
    # JVM
    ".class", ".jar", ".war", ".ear",
    # .NET
    ".exe", ".pdb", ".nupkg",
    # Compiled binaries
    ".so", ".dylib", ".dll", ".o", ".obj", ".a", ".lib", ".wasm",
    # Databases
    ".sqlite", ".sqlite3", ".db", ".sql",
    # Images
    ".png", ".jpg", ".jpeg", ".gif", ".ico", ".webp", ".bmp", ".tiff", ".tif",
    ".psd", ".ai", ".sketch", ".fig",
    # Videos
    ".mp4", ".avi", ".mov", ".wmv", ".webm", ".mkv", ".flv",
    # Audio
    ".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a",
    # Documents
    ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
    # Archives
    ".zip", ".tar", ".gz", ".tgz", ".rar", ".7z", ".bz2", ".xz",
    # Fonts
    ".woff", ".woff2", ".ttf", ".otf", ".eot",
    # Logs/temp
    ".log", ".tmp", ".temp", ".bak", ".backup", ".old", ".swp", ".swo",
    # Coverage
    ".lcov",
    # IDE
    ".code-workspace",
}

# Specific filenames to skip (from backend exclusion_patterns.py)
SKIP_FILES = {
    # Lock files
    "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "bun.lockb",
    "poetry.lock", "Pipfile.lock", "Gemfile.lock", "composer.lock",
    "Cargo.lock", "gradle.lockfile", "Package.resolved",
    # OS files
    ".DS_Store", "Thumbs.db", "desktop.ini", "ehthumbs.db",
    # Security - credentials
    ".env", ".envrc", ".npmrc", ".pypirc", ".netrc", ".htpasswd",
    # Logs
    "npm-debug.log", "yarn-debug.log", "yarn-error.log", ".pnpm-debug.log",
    "pip-log.txt",
    # IDE
    ".project", ".classpath",
    # Python
    ".coverage",
}

# Patterns that match anywhere in the path (for files like id_rsa, credentials.json)
SKIP_PATH_PATTERNS = {
    "credentials", "secrets", ".secret", ".secrets",
    "id_rsa", "id_dsa", "id_ecdsa", "id_ed25519",
}

# Type identifiers
TYPE_IMESSAGE = "imessage"
TYPE_SAFARI_HISTORY = "safari_history"
TYPE_CHROME_HISTORY = "chrome_history"
TYPE_FIREFOX_HISTORY = "firefox_history"
TYPE_TELEGRAM = "telegram"
TYPE_FOLDER = "folder"
TYPE_GENERIC_DB = "generic"

# Limits
MAX_ROWS = 100_000
MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024  # 10MB per file


def detect_source_type(path: str) -> str:
    """
    Auto-detect the type of source based on path and file structure.

    Args:
        path: Path to file or directory

    Returns:
        Type identifier string
    """
    # Check if directory (regular folder or telegram export)
    if os.path.isdir(path):
        if os.path.exists(os.path.join(path, "result.json")):
            return TYPE_TELEGRAM
        return TYPE_FOLDER

    # Check for Telegram JSON export
    if path.endswith(".json"):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
                if "chats" in data and isinstance(data.get("chats"), dict):
                    return TYPE_TELEGRAM
        except Exception:
            pass

    # Check for ZIP (could be Telegram export)
    if zipfile.is_zipfile(path):
        try:
            with zipfile.ZipFile(path, "r") as zf:
                names = zf.namelist()
                if "result.json" in names or any(n.endswith("/result.json") for n in names):
                    return TYPE_TELEGRAM
        except Exception:
            pass

    # Check SQLite databases
    if not os.path.isfile(path):
        return TYPE_FOLDER

    try:
        conn = sqlite3.connect(path)
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = {row[0].lower() for row in cursor.fetchall()}
        conn.close()

        # iMessage
        if "message" in tables and "handle" in tables and "chat" in tables:
            return TYPE_IMESSAGE

        # Safari History
        if "history_items" in tables and "history_visits" in tables:
            return TYPE_SAFARI_HISTORY

        # Chrome/Brave/Edge History
        if "urls" in tables and "visits" in tables and "keyword_search_terms" in tables:
            return TYPE_CHROME_HISTORY

        # Firefox History
        if "moz_places" in tables and "moz_historyvisits" in tables:
            return TYPE_FIREFOX_HISTORY

        return TYPE_GENERIC_DB

    except Exception:
        return TYPE_FOLDER


def extract_incremental(
    path: str,
    source_type: str,
    cursor: dict[str, Any] | None = None,
    limit: int = MAX_ROWS,
) -> dict[str, Any]:
    """
    Extract data incrementally from a source.

    Args:
        path: Path to the source
        source_type: Type of source
        cursor: Previous sync cursor (for incremental extraction)
        limit: Maximum items to extract

    Returns:
        Dict with files, new cursor, and stats
    """
    cursor = cursor or {}

    if source_type == TYPE_IMESSAGE:
        return _extract_imessage(path, cursor, limit)
    elif source_type == TYPE_SAFARI_HISTORY:
        return _extract_safari_history(path, cursor, limit)
    elif source_type == TYPE_CHROME_HISTORY:
        return _extract_chrome_history(path, cursor, limit)
    elif source_type == TYPE_FIREFOX_HISTORY:
        return _extract_firefox_history(path, cursor, limit)
    elif source_type == TYPE_TELEGRAM:
        return _extract_telegram(path, cursor, limit)
    elif source_type == TYPE_FOLDER:
        return _extract_folder(path, cursor, limit)
    else:
        return _extract_generic_db(path, cursor, limit)


def _extract_imessage(
    db_path: str,
    cursor: dict[str, Any],
    limit: int,
) -> dict[str, Any]:
    """Extract messages from iMessage chat.db."""
    files = []
    max_rowid = cursor.get("last_rowid", 0)
    max_timestamp = cursor.get("last_timestamp", 0)
    since_rowid = cursor.get("last_rowid")

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()

    where_clauses = ["m.text IS NOT NULL", "m.text != ''"]
    params = []

    if since_rowid:
        where_clauses.append("m.ROWID > ?")
        params.append(since_rowid)

    params.append(limit)

    query = f"""
        SELECT
            m.ROWID as row_id,
            m.text,
            m.date,
            m.is_from_me,
            m.service,
            h.id as contact_id,
            COALESCE(h.uncanonicalized_id, h.id) as contact_display
        FROM message m
        LEFT JOIN handle h ON m.handle_id = h.ROWID
        WHERE {' AND '.join(where_clauses)}
        ORDER BY m.ROWID ASC
        LIMIT ?
    """

    cur.execute(query, params)
    rows = cur.fetchall()

    for row in rows:
        row_id = row["row_id"]
        text = row["text"]
        apple_date = row["date"]
        is_from_me = row["is_from_me"]
        contact_display = row["contact_display"] or row["contact_id"] or "unknown"

        max_rowid = max(max_rowid, row_id)
        if apple_date:
            max_timestamp = max(max_timestamp, apple_date)

        if not text or len(text.strip()) < 2:
            continue

        # Convert Apple date to ISO
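        # Modern chat.db stores `date` as nanoseconds since the Apple epoch
        # (2001-01-01 UTC); 978307200 is that epoch as a Unix timestamp. This
        # assumes the nanosecond format; very old databases stored plain seconds.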
        try:
            if apple_date:
                unix_ts = (apple_date / 1_000_000_000) + 978307200
                dt = datetime.fromtimestamp(unix_ts, tz=timezone.utc)
                timestamp_str = dt.isoformat()
                date_prefix = dt.strftime("%Y-%m-%d")
            else:
                timestamp_str = None
                date_prefix = "unknown"
        except Exception:
            timestamp_str = None
            date_prefix = "unknown"

        safe_contact = re.sub(r"[^\w\-_]", "_", str(contact_display))[:50]
        direction = "sent" if is_from_me else "received"
        file_path = f"messages/{safe_contact}/{date_prefix}_{row_id}_{direction}.txt"

        files.append({
            "path": file_path,
            "content": text,
            "metadata": {
                "db_type": TYPE_IMESSAGE,
                "row_id": row_id,
                "timestamp": timestamp_str,
                "contact": contact_display,
                "is_from_me": bool(is_from_me),
            },
        })

    conn.close()
    logger.info(f"Extracted {len(files)} messages from iMessage")

    return {
        "files": files,
        "cursor": {"last_rowid": max_rowid, "last_timestamp": max_timestamp},
        "stats": {"extracted": len(files), "db_type": TYPE_IMESSAGE},
    }


def _extract_safari_history(
    db_path: str,
    cursor: dict[str, Any],
    limit: int,
) -> dict[str, Any]:
    """Extract browsing history from Safari History.db."""
    files = []
    max_visit_time = cursor.get("last_visit_time", 0)
    since_visit_time = cursor.get("last_visit_time")

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()

    where_clauses = ["hv.title IS NOT NULL", "hv.title != ''"]
    params = []

    if since_visit_time:
        where_clauses.append("hv.visit_time > ?")
        params.append(since_visit_time)

    params.append(limit)

    query = f"""
        SELECT
            hi.id,
            hi.url,
            hi.domain_expansion,
            hv.title,
            hv.visit_time
        FROM history_visits hv
        JOIN history_items hi ON hi.id = hv.history_item
        WHERE {' AND '.join(where_clauses)}
        ORDER BY hv.visit_time ASC
        LIMIT ?
    """

    cur.execute(query, params)
    rows = cur.fetchall()

    for row in rows:
        item_id = row["id"]
        url = row["url"] or ""
        domain = row["domain_expansion"] or ""
        title = row["title"] or ""
        visit_time = row["visit_time"]

        max_visit_time = max(max_visit_time, visit_time or 0)

        if not title.strip():
            continue

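        # Safari's visit_time counts seconds since the Apple epoch
        # (2001-01-01 UTC), hence the same 978307200 offset used for iMessage.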
        try:
            if visit_time:
                unix_ts = visit_time + 978307200
                dt = datetime.fromtimestamp(unix_ts, tz=timezone.utc)
                timestamp_str = dt.isoformat()
                date_prefix = dt.strftime("%Y-%m-%d")
            else:
                timestamp_str = None
                date_prefix = "unknown"
        except Exception:
            timestamp_str = None
            date_prefix = "unknown"

        content = f"{title}\n{url}"
        safe_domain = re.sub(r"[^\w\-_]", "_", domain)[:30] if domain else "other"
        file_path = f"history/{safe_domain}/{date_prefix}_{item_id}.txt"

        files.append({
            "path": file_path,
            "content": content,
            "metadata": {
                "db_type": TYPE_SAFARI_HISTORY,
                "row_id": item_id,
                "timestamp": timestamp_str,
                "url": url,
                "domain": domain,
            },
        })

    conn.close()
    logger.info(f"Extracted {len(files)} history items from Safari")

    return {
        "files": files,
        "cursor": {"last_visit_time": max_visit_time},
        "stats": {"extracted": len(files), "db_type": TYPE_SAFARI_HISTORY},
    }


def _extract_chrome_history(
    db_path: str,
    cursor: dict[str, Any],
    limit: int,
) -> dict[str, Any]:
    """Extract browsing history from Chrome/Brave/Edge."""
    files = []
    max_visit_time = cursor.get("last_visit_time", 0)
    since_visit_time = cursor.get("last_visit_time")

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()

    where_clauses = ["u.title IS NOT NULL", "u.title != ''"]
    params = []

    if since_visit_time:
        where_clauses.append("v.visit_time > ?")
        params.append(since_visit_time)

    params.append(limit)

    query = f"""
        SELECT
            u.id,
            u.url,
            u.title,
            v.visit_time
        FROM visits v
        JOIN urls u ON u.id = v.url
        WHERE {' AND '.join(where_clauses)}
        ORDER BY v.visit_time ASC
        LIMIT ?
    """

    cur.execute(query, params)
    rows = cur.fetchall()

    for row in rows:
        url_id = row["id"]
        url = row["url"] or ""
        title = row["title"] or ""
        visit_time = row["visit_time"]

        max_visit_time = max(max_visit_time, visit_time or 0)

        if not title.strip():
            continue

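        # Chrome/WebKit visit_time is microseconds since 1601-01-01 UTC;
        # subtracting 11644473600 seconds shifts it onto the Unix epoch.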
        try:
            if visit_time:
                unix_ts = (visit_time / 1_000_000) - 11644473600
                dt = datetime.fromtimestamp(unix_ts, tz=timezone.utc)
                timestamp_str = dt.isoformat()
                date_prefix = dt.strftime("%Y-%m-%d")
            else:
                timestamp_str = None
                date_prefix = "unknown"
        except Exception:
            timestamp_str = None
            date_prefix = "unknown"

        try:
            from urllib.parse import urlparse
            domain = urlparse(url).netloc or "other"
        except Exception:
            domain = "other"

        content = f"{title}\n{url}"
        safe_domain = re.sub(r"[^\w\-_]", "_", domain)[:30]
        file_path = f"history/{safe_domain}/{date_prefix}_{url_id}.txt"

        files.append({
            "path": file_path,
            "content": content,
            "metadata": {
                "db_type": TYPE_CHROME_HISTORY,
                "row_id": url_id,
                "timestamp": timestamp_str,
                "url": url,
                "domain": domain,
            },
        })

    conn.close()
    logger.info(f"Extracted {len(files)} history items from Chrome")

    return {
        "files": files,
        "cursor": {"last_visit_time": max_visit_time},
        "stats": {"extracted": len(files), "db_type": TYPE_CHROME_HISTORY},
    }


def _extract_firefox_history(
    db_path: str,
    cursor: dict[str, Any],
    limit: int,
) -> dict[str, Any]:
    """Extract browsing history from Firefox places.sqlite."""
    files = []
    max_visit_date = cursor.get("last_visit_date", 0)
    since_visit_date = cursor.get("last_visit_date")

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()

    where_clauses = ["p.title IS NOT NULL", "p.title != ''"]
    params = []

    if since_visit_date:
        where_clauses.append("h.visit_date > ?")
        params.append(since_visit_date)

    params.append(limit)

    query = f"""
        SELECT
            p.id,
            p.url,
            p.title,
            h.visit_date
        FROM moz_places p
        JOIN moz_historyvisits h ON p.id = h.place_id
        WHERE {' AND '.join(where_clauses)}
        ORDER BY h.visit_date ASC
        LIMIT ?
    """

    cur.execute(query, params)
    rows = cur.fetchall()

    for row in rows:
        place_id = row["id"]
        url = row["url"] or ""
        title = row["title"] or ""
        visit_date = row["visit_date"]

        max_visit_date = max(max_visit_date, visit_date or 0)

        if not title.strip():
            continue

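        # Firefox stores visit_date as microseconds since the Unix epoch (PRTime).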
        try:
            if visit_date:
                unix_ts = visit_date / 1_000_000
                dt = datetime.fromtimestamp(unix_ts, tz=timezone.utc)
                timestamp_str = dt.isoformat()
                date_prefix = dt.strftime("%Y-%m-%d")
            else:
                timestamp_str = None
                date_prefix = "unknown"
        except Exception:
            timestamp_str = None
            date_prefix = "unknown"

        try:
            from urllib.parse import urlparse
            domain = urlparse(url).netloc or "other"
        except Exception:
            domain = "other"

        content = f"{title}\n{url}"
        safe_domain = re.sub(r"[^\w\-_]", "_", domain)[:30]
        file_path = f"history/{safe_domain}/{date_prefix}_{place_id}.txt"

        files.append({
            "path": file_path,
            "content": content,
            "metadata": {
                "db_type": TYPE_FIREFOX_HISTORY,
                "row_id": place_id,
                "timestamp": timestamp_str,
                "url": url,
                "domain": domain,
            },
        })

    conn.close()
    logger.info(f"Extracted {len(files)} history items from Firefox")

    return {
        "files": files,
        "cursor": {"last_visit_date": max_visit_date},
        "stats": {"extracted": len(files), "db_type": TYPE_FIREFOX_HISTORY},
    }


def _extract_telegram(
    export_path: str,
    cursor: dict[str, Any],
    limit: int,
) -> dict[str, Any]:
    """Extract messages from Telegram export (JSON or ZIP)."""
    files = []
    max_message_id = cursor.get("last_message_id", 0)
    since_message_id = cursor.get("last_message_id")
    extracted_count = 0

    # Handle different input types
    if zipfile.is_zipfile(export_path):
        import tempfile
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(export_path, "r") as zf:
                zf.extractall(temp_dir)
            result_file = os.path.join(temp_dir, "result.json")
            if not os.path.exists(result_file):
                for item in os.listdir(temp_dir):
                    subdir = os.path.join(temp_dir, item)
                    if os.path.isdir(subdir):
                        candidate = os.path.join(subdir, "result.json")
                        if os.path.exists(candidate):
                            result_file = candidate
                            break
            return _extract_telegram_json(result_file, cursor, limit)
    elif os.path.isdir(export_path):
        result_file = os.path.join(export_path, "result.json")
    else:
        result_file = export_path

    return _extract_telegram_json(result_file, cursor, limit)


def _extract_telegram_json(
    result_file: str,
    cursor: dict[str, Any],
    limit: int,
) -> dict[str, Any]:
    """Extract from Telegram result.json file."""
    files = []
    max_message_id = cursor.get("last_message_id", 0)
    since_message_id = cursor.get("last_message_id")
    extracted_count = 0

    with open(result_file, "r", encoding="utf-8") as f:
        data = json.load(f)

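    # Telegram Desktop's "Export chat history" JSON nests chats under
    # data["chats"]["list"], and each chat carries its own "messages" array.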
    chats_data = data.get("chats", {})
    chats = chats_data.get("list", []) if isinstance(chats_data, dict) else []

    for chat in chats:
        if extracted_count >= limit:
            break

        chat_name = chat.get("name", "Unknown Chat")
        chat_type = chat.get("type", "personal_chat")
        chat_id = chat.get("id", 0)

        messages = chat.get("messages", [])

        for msg in messages:
            if extracted_count >= limit:
                break

            msg_id = msg.get("id")
            msg_type = msg.get("type", "message")

            if since_message_id and msg_id and msg_id <= since_message_id:
                continue

            if msg_type != "message":
                continue

            # Handle text (can be string or list)
            text_content = msg.get("text", "")
            if isinstance(text_content, list):
                parts = []
                for item in text_content:
                    if isinstance(item, dict):
                        parts.append(item.get("text", ""))
                    elif isinstance(item, str):
                        parts.append(item)
                text_content = "".join(parts)

            if not text_content or not text_content.strip():
                continue

            if msg_id:
                max_message_id = max(max_message_id, msg_id)

            date_str = msg.get("date", "")
            try:
                if date_str:
                    dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
                    timestamp_str = dt.isoformat()
                    date_prefix = dt.strftime("%Y-%m-%d")
                else:
                    timestamp_str = None
                    date_prefix = "unknown"
            except Exception:
                timestamp_str = date_str
                date_prefix = "unknown"

            from_name = msg.get("from", "") or msg.get("actor", "") or "Unknown"

            safe_chat = re.sub(r"[^\w\-_]", "_", chat_name)[:50]
            file_path = f"telegram/{safe_chat}/{date_prefix}_{msg_id}.txt"

            files.append({
                "path": file_path,
                "content": text_content,
                "metadata": {
                    "db_type": TYPE_TELEGRAM,
                    "chat_name": chat_name,
                    "chat_type": chat_type,
                    "message_id": msg_id,
                    "timestamp": timestamp_str,
                    "from_name": from_name,
                },
            })
            extracted_count += 1

    logger.info(f"Extracted {len(files)} messages from Telegram")

    return {
        "files": files,
        "cursor": {"last_message_id": max_message_id},
        "stats": {"extracted": len(files), "db_type": TYPE_TELEGRAM},
    }


def _extract_folder(
    folder_path: str,
    cursor: dict[str, Any],
    limit: int,
) -> dict[str, Any]:
    """Extract text files from a regular folder with proper exclusion patterns."""
    files = []
    last_mtime = cursor.get("last_mtime", 0)
    max_mtime = last_mtime
    extracted_count = 0

    # Allowed text file extensions
    text_extensions = {
        ".txt", ".md", ".py", ".js", ".ts", ".tsx", ".jsx", ".json", ".yaml", ".yml",
        ".html", ".css", ".scss", ".less", ".xml", ".csv", ".sh", ".bash", ".zsh",
        ".rs", ".go", ".java", ".c", ".cpp", ".h", ".hpp", ".rb", ".vue", ".svelte",
        ".php", ".swift", ".kt", ".scala", ".r", ".sql", ".toml", ".ini", ".cfg",
        ".makefile", ".dockerfile", ".gitignore", ".editorconfig",
    }

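    # Walk top-down so the in-place dirs[:] filter below prunes excluded
    # directories before os.walk descends into them; files whose mtime is not
    # newer than the cursor's last_mtime are skipped further down.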
    for root, dirs, filenames in os.walk(folder_path, topdown=True):
        if extracted_count >= limit:
            break

        # Filter out excluded directories IN-PLACE to prevent os.walk from descending
        dirs[:] = [
            d for d in dirs
            if d not in SKIP_DIRS
            and not d.startswith(".")
            and not d.endswith(".egg-info")
        ]

        for filename in filenames:
            if extracted_count >= limit:
                break

            # Skip by filename
            if filename in SKIP_FILES:
                continue

            # Skip hidden files
            if filename.startswith("."):
                continue

            # Skip files matching security patterns (credentials, secrets, keys)
            filename_lower = filename.lower()
            if any(pattern in filename_lower for pattern in SKIP_PATH_PATTERNS):
                continue

            ext = Path(filename).suffix.lower()

            # Skip by extension
            if ext in SKIP_EXTENSIONS:
                continue

            # Only include known text extensions
            if ext and ext not in text_extensions:
                continue

            file_path = os.path.join(root, filename)

            try:
                stat = os.stat(file_path)
                mtime = stat.st_mtime

                # Skip if not modified since last sync
                if mtime <= last_mtime:
                    continue

                # Skip large files
                if stat.st_size > MAX_FILE_SIZE_BYTES:
                    continue

                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                    content = f.read()

                if not content.strip():
                    continue

                max_mtime = max(max_mtime, mtime)

                # Relative path from folder root
                rel_path = os.path.relpath(file_path, folder_path)

                files.append({
                    "path": rel_path,
                    "content": content,
                    "metadata": {
                        "db_type": TYPE_FOLDER,
                        "extension": ext,
                        "mtime": mtime,
                    },
                })
                extracted_count += 1

            except (PermissionError, IOError) as e:
                logger.warning(f"Could not read {file_path}: {e}")
                continue

    logger.info(f"Extracted {len(files)} files from folder")

    return {
        "files": files,
        "cursor": {"last_mtime": max_mtime},
        "stats": {"extracted": len(files), "db_type": TYPE_FOLDER},
    }


def _extract_generic_db(
    db_path: str,
    cursor: dict[str, Any],
    limit: int,
) -> dict[str, Any]:
    """Extract from generic SQLite database."""
    files = []
    total_extracted = 0

    skip_tables = {"sqlite_sequence", "sqlite_stat1", "sqlite_stat4"}

    conn = sqlite3.connect(db_path)
    cur = conn.cursor()

    cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tables = [row[0] for row in cur.fetchall()]

    for table_name in tables:
        if table_name.lower() in skip_tables:
            continue

        if total_extracted >= limit:
            break

        try:
            cur.execute(f'PRAGMA table_info("{table_name}")')
            columns = cur.fetchall()

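            # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk),
            # so col[1] is the column name, col[2] its declared type, col[5] the PK flag.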
            text_columns = [
                col[1]
                for col in columns
                if col[2].upper() in ("TEXT", "VARCHAR", "CHAR", "CLOB")
            ]

            if not text_columns:
                continue

            pk_column = next((col[1] for col in columns if col[5] == 1), "rowid")

            select_cols = [f'"{pk_column}"'] + [f'"{col}"' for col in text_columns]
            remaining = limit - total_extracted
            query = f'SELECT {", ".join(select_cols)} FROM "{table_name}" LIMIT ?'

            cur.execute(query, (remaining,))
            rows = cur.fetchall()

            for row in rows:
                pk_value = row[0]
                text_values = row[1:]

                combined = []
                for col_name, value in zip(text_columns, text_values):
                    if value and isinstance(value, str) and value.strip():
                        combined.append(f"{col_name}: {value}")

                if not combined:
                    continue

                content = "\n".join(combined)
                safe_table = re.sub(r"[^\w\-_]", "_", table_name)
                file_path = f"{safe_table}/row_{pk_value}.txt"

                files.append({
                    "path": file_path,
                    "content": content,
                    "metadata": {
                        "db_type": TYPE_GENERIC_DB,
                        "table": table_name,
                        "row_id": pk_value,
                    },
                })
                total_extracted += 1

        except Exception as e:
            logger.warning(f"Error extracting from table {table_name}: {e}")
            continue

    conn.close()
    logger.info(f"Extracted {len(files)} rows from generic database")

    return {
        "files": files,
        "cursor": {},
        "stats": {"extracted": len(files), "db_type": TYPE_GENERIC_DB},
    }