footprinter-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. footprinter/__init__.py +8 -0
  2. footprinter/access.py +444 -0
  3. footprinter/api/__init__.py +1 -0
  4. footprinter/api/db.py +61 -0
  5. footprinter/api/entities.py +250 -0
  6. footprinter/api/search.py +47 -0
  7. footprinter/api/semantic.py +33 -0
  8. footprinter/api/server.py +66 -0
  9. footprinter/api/status.py +15 -0
  10. footprinter/bundled/__init__.py +0 -0
  11. footprinter/bundled/config.example.yaml +161 -0
  12. footprinter/bundled/patterns/context_patterns.yaml +18 -0
  13. footprinter/bundled/patterns/extensions.yaml +283 -0
  14. footprinter/bundled/patterns/filename_patterns.yaml +61 -0
  15. footprinter/bundled/patterns/mime_mappings.yaml +68 -0
  16. footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
  17. footprinter/bundled/patterns/security_patterns.yaml +27 -0
  18. footprinter/cli/__init__.py +128 -0
  19. footprinter/cli/__main__.py +6 -0
  20. footprinter/cli/_common.py +332 -0
  21. footprinter/cli/_policy_helpers.py +646 -0
  22. footprinter/cli/_prompt.py +220 -0
  23. footprinter/cli/api_cmd.py +32 -0
  24. footprinter/cli/connect.py +591 -0
  25. footprinter/cli/data.py +879 -0
  26. footprinter/cli/delete.py +128 -0
  27. footprinter/cli/ingest.py +579 -0
  28. footprinter/cli/mcp_cmd.py +750 -0
  29. footprinter/cli/mcp_setup.py +306 -0
  30. footprinter/cli/search.py +393 -0
  31. footprinter/cli/search_cmd.py +69 -0
  32. footprinter/cli/setup.py +1836 -0
  33. footprinter/cli/status.py +729 -0
  34. footprinter/cli/status_cmd.py +104 -0
  35. footprinter/cli/upsert.py +794 -0
  36. footprinter/cli/vectorize_cmd.py +215 -0
  37. footprinter/cli/view.py +322 -0
  38. footprinter/connectors/__init__.py +171 -0
  39. footprinter/connectors/config_utils.py +141 -0
  40. footprinter/db/__init__.py +37 -0
  41. footprinter/db/browser.py +198 -0
  42. footprinter/db/chats.py +610 -0
  43. footprinter/db/clients.py +307 -0
  44. footprinter/db/emails.py +279 -0
  45. footprinter/db/files.py +741 -0
  46. footprinter/db/folders.py +659 -0
  47. footprinter/db/messages.py +192 -0
  48. footprinter/db/policies.py +151 -0
  49. footprinter/db/projects.py +673 -0
  50. footprinter/db/search.py +573 -0
  51. footprinter/db/sql_utils.py +168 -0
  52. footprinter/db/status.py +320 -0
  53. footprinter/db/uploads.py +70 -0
  54. footprinter/ingest/__init__.py +0 -0
  55. footprinter/ingest/adapters/__init__.py +33 -0
  56. footprinter/ingest/adapters/browser.py +54 -0
  57. footprinter/ingest/adapters/chat.py +57 -0
  58. footprinter/ingest/adapters/ingest.py +146 -0
  59. footprinter/ingest/adapters/local_files.py +68 -0
  60. footprinter/ingest/adapters/local_folders.py +52 -0
  61. footprinter/ingest/adapters/protocol.py +174 -0
  62. footprinter/ingest/browser_indexer.py +216 -0
  63. footprinter/ingest/chat_dedup.py +156 -0
  64. footprinter/ingest/chat_indexer.py +515 -0
  65. footprinter/ingest/chat_parsers/__init__.py +8 -0
  66. footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
  67. footprinter/ingest/chat_parsers/claude_parser.py +161 -0
  68. footprinter/ingest/cli.py +827 -0
  69. footprinter/ingest/content_extractors.py +117 -0
  70. footprinter/ingest/database.py +36 -0
  71. footprinter/ingest/db/__init__.py +1 -0
  72. footprinter/ingest/db/connector_schema.py +47 -0
  73. footprinter/ingest/db/migration.py +328 -0
  74. footprinter/ingest/db/schema.py +1043 -0
  75. footprinter/ingest/db/security.py +6 -0
  76. footprinter/ingest/file_indexer.py +261 -0
  77. footprinter/ingest/file_scanner.py +277 -0
  78. footprinter/ingest/folder_indexer.py +226 -0
  79. footprinter/ingest/full_content_extractor.py +321 -0
  80. footprinter/ingest/orchestrator.py +125 -0
  81. footprinter/ingest/pipe_runner.py +217 -0
  82. footprinter/ingest/processing.py +165 -0
  83. footprinter/ingest/registry.py +201 -0
  84. footprinter/ingest/run_record.py +91 -0
  85. footprinter/ingest/status.py +346 -0
  86. footprinter/mcp/__init__.py +0 -0
  87. footprinter/mcp/__main__.py +5 -0
  88. footprinter/mcp/db.py +57 -0
  89. footprinter/mcp/errors.py +102 -0
  90. footprinter/mcp/extraction.py +226 -0
  91. footprinter/mcp/server.py +39 -0
  92. footprinter/mcp/tools/__init__.py +0 -0
  93. footprinter/mcp/tools/navigation.py +70 -0
  94. footprinter/mcp/tools/read.py +75 -0
  95. footprinter/mcp/tools/search.py +158 -0
  96. footprinter/mcp/tools/semantic.py +79 -0
  97. footprinter/mcp/tools/status.py +15 -0
  98. footprinter/paths.py +91 -0
  99. footprinter/permissions.py +1160 -0
  100. footprinter/semantic/__init__.py +13 -0
  101. footprinter/semantic/chunking.py +52 -0
  102. footprinter/semantic/embeddings.py +23 -0
  103. footprinter/semantic/hybrid_search.py +273 -0
  104. footprinter/semantic/vector_store.py +471 -0
  105. footprinter/services/__init__.py +49 -0
  106. footprinter/services/access_service.py +342 -0
  107. footprinter/services/chat_service.py +85 -0
  108. footprinter/services/client_service.py +267 -0
  109. footprinter/services/content_service.py +181 -0
  110. footprinter/services/email_service.py +89 -0
  111. footprinter/services/file_service.py +83 -0
  112. footprinter/services/folder_service.py +122 -0
  113. footprinter/services/includes.py +19 -0
  114. footprinter/services/ingest_service.py +231 -0
  115. footprinter/services/project_service.py +262 -0
  116. footprinter/services/roles.py +25 -0
  117. footprinter/services/search_service.py +177 -0
  118. footprinter/services/semantic_service.py +360 -0
  119. footprinter/services/status_service.py +18 -0
  120. footprinter/services/visit_service.py +65 -0
  121. footprinter/source_registry.py +194 -0
  122. footprinter/utils/__init__.py +7 -0
  123. footprinter/utils/hash_utils.py +59 -0
  124. footprinter/utils/logging_config.py +68 -0
  125. footprinter/utils/mime.py +30 -0
  126. footprinter/utils/text.py +6 -0
  127. footprinter/utils/time.py +11 -0
  128. footprinter/visibility.py +1272 -0
  129. footprinter_cli-1.0.0.dist-info/LICENSE +21 -0
  130. footprinter_cli-1.0.0.dist-info/METADATA +229 -0
  131. footprinter_cli-1.0.0.dist-info/RECORD +134 -0
  132. footprinter_cli-1.0.0.dist-info/WHEEL +5 -0
  133. footprinter_cli-1.0.0.dist-info/entry_points.txt +2 -0
  134. footprinter_cli-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,216 @@
1
+ """
2
+ Browser parsers for Safari and Chrome.
3
+ """
4
+
5
+ import logging
6
+ import platform
7
+ import shutil
8
+ import sqlite3
9
+ import tempfile
10
+ from datetime import datetime, timedelta, timezone
11
+ from pathlib import Path
12
+ from typing import Dict, Generator
13
+
14
+ from footprinter.utils.time import UTC_FMT
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class BrowserParser:
20
+ """Base class for browser history parsing."""
21
+
22
+ def __init__(self, lookback_days: int = 14, since: datetime | None = None):
23
+ self.lookback_days = lookback_days
24
+ # Ensure cutoff is tz-aware UTC for comparison with tz-aware epoch constants
25
+ if since is not None:
26
+ self.cutoff_date = since.astimezone(timezone.utc) if since.tzinfo else since.replace(tzinfo=timezone.utc)
27
+ else:
28
+ self.cutoff_date = datetime.now(timezone.utc) - timedelta(days=lookback_days)
29
+
30
+ def parse(self) -> Generator[Dict, None, None]:
31
+ """Parse browser history. To be implemented by subclasses."""
32
+ raise NotImplementedError
33
+
34
+
35
class SafariParser(BrowserParser):
    """Parse Safari browser history.

    Safari is only looked for on macOS (``platform.system() == "Darwin"``);
    on any other platform ``history_db_path`` is ``None`` and ``parse``
    yields nothing.
    """

    def __init__(self, lookback_days: int = 14, since: datetime | None = None):
        super().__init__(lookback_days, since=since)
        if platform.system() != "Darwin":
            # ``None`` signals "unsupported platform" to ``parse``.
            self.history_db_path = None
        else:
            self.history_db_path = Path.home() / "Library" / "Safari" / "History.db"

    def parse(self) -> Generator[Dict, None, None]:
        """Parse Safari history from its SQLite database.

        Yields one dict per visit newer than ``self.cutoff_date``, newest
        first, with keys ``url``, ``title``, ``visit_time`` (formatted with
        ``UTC_FMT``), ``browser`` (always ``"safari"``) and ``visit_count``
        (always ``1``, since every row is a single visit).

        Best-effort: errors while copying or reading the database are logged
        and end the iteration instead of propagating to the caller.
        """
        if self.history_db_path is None:
            logger.warning(
                "Safari history parsing skipped (unsupported platform: %s)",
                platform.system(),
            )
            return
        if not self.history_db_path.exists():
            logger.warning("Safari history not found at %s", self.history_db_path)
            return

        # Safari's History.db may be locked, so work on a private snapshot.
        # A dedicated directory keeps the snapshot and any sidecar files
        # together so they can be removed in one step.
        tmp_dir = Path(tempfile.mkdtemp(prefix="footprinter-safari-"))
        tmp_path = tmp_dir / self.history_db_path.name

        conn = None
        try:
            shutil.copy2(self.history_db_path, tmp_path)

            # A SQLite database in WAL mode keeps its most recent commits in
            # a "-wal" sidecar until the next checkpoint. Copying only the
            # main file would silently drop those visits, so take the
            # sidecar along when it exists. The copy is best-effort: without
            # it we still get the checkpointed part of the history.
            wal_path = self.history_db_path.with_name(self.history_db_path.name + "-wal")
            if wal_path.exists():
                try:
                    shutil.copy2(wal_path, tmp_path.with_name(tmp_path.name + "-wal"))
                except OSError as e:
                    logger.debug("Could not copy Safari WAL file %s: %s", wal_path, e)

            conn = sqlite3.connect(str(tmp_path))
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            # Safari stores visit time as seconds since 2001-01-01 UTC (Core Data timestamp)
            core_data_epoch = datetime(2001, 1, 1, tzinfo=timezone.utc)
            cutoff_timestamp = (self.cutoff_date - core_data_epoch).total_seconds()

            query = """
                SELECT
                    hv.visit_time,
                    hi.url,
                    hi.title
                FROM history_visits hv
                JOIN history_items hi ON hv.history_item = hi.id
                WHERE hv.visit_time > ?
                ORDER BY hv.visit_time DESC
            """

            cursor.execute(query, (cutoff_timestamp,))

            for row in cursor:
                # Convert Safari's Core Data timestamp to datetime
                visit_time = core_data_epoch + timedelta(seconds=row["visit_time"])

                yield {
                    "url": row["url"],
                    "title": row["title"],
                    "visit_time": visit_time.strftime(UTC_FMT),
                    "browser": "safari",
                    "visit_count": 1,
                }

        except Exception as e:
            # Deliberately non-fatal: one unreadable browser must not abort
            # the whole indexing run.
            logger.error("Error parsing Safari history: %s", e)
        finally:
            if conn:
                conn.close()
            # Remove the snapshot (and any sidecar files SQLite created).
            shutil.rmtree(tmp_dir, ignore_errors=True)
108
+
109
+
110
class ChromeParser(BrowserParser):
    """Parse Chrome browser history.

    The default-profile history database is looked up on macOS and Linux;
    on any other platform ``history_db_path`` is ``None`` and ``parse``
    yields nothing.
    """

    def __init__(self, lookback_days: int = 14, since: datetime | None = None):
        super().__init__(lookback_days, since=since)
        system = platform.system()
        if system == "Darwin":
            self.history_db_path = (
                Path.home() / "Library" / "Application Support" / "Google" / "Chrome" / "Default" / "History"
            )
        elif system == "Linux":
            self.history_db_path = Path.home() / ".config" / "google-chrome" / "Default" / "History"
        else:
            # ``None`` signals "unsupported platform" to ``parse``.
            self.history_db_path = None

    def parse(self) -> Generator[Dict, None, None]:
        """Parse Chrome history from its SQLite database.

        Yields one dict per visit newer than ``self.cutoff_date``, newest
        first, with keys ``url``, ``title``, ``visit_time`` (formatted with
        ``UTC_FMT``), ``browser`` (always ``"chrome"``) and ``visit_count``
        (the URL's stored count, or ``1`` when that is missing or zero).

        Best-effort: errors while copying or reading the database are logged
        and end the iteration instead of propagating to the caller.
        """
        if self.history_db_path is None:
            logger.warning(
                "Chrome history parsing skipped (unsupported platform: %s)",
                platform.system(),
            )
            return
        if not self.history_db_path.exists():
            logger.warning("Chrome history not found at %s", self.history_db_path)
            return

        # Chrome's History may be locked, so work on a private snapshot.
        # A dedicated directory keeps the snapshot and any sidecar files
        # together so they can be removed in one step.
        tmp_dir = Path(tempfile.mkdtemp(prefix="footprinter-chrome-"))
        tmp_path = tmp_dir / self.history_db_path.name

        conn = None
        try:
            shutil.copy2(self.history_db_path, tmp_path)

            # If the database runs in WAL mode its newest commits live in a
            # "-wal" sidecar until the next checkpoint; take it along when
            # present so those visits are not dropped. Best-effort only.
            wal_path = self.history_db_path.with_name(self.history_db_path.name + "-wal")
            if wal_path.exists():
                try:
                    shutil.copy2(wal_path, tmp_path.with_name(tmp_path.name + "-wal"))
                except OSError as e:
                    logger.debug("Could not copy Chrome WAL file %s: %s", wal_path, e)

            conn = sqlite3.connect(str(tmp_path))
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            # Chrome stores time as microseconds since 1601-01-01 UTC (Windows epoch)
            chrome_epoch = datetime(1601, 1, 1, tzinfo=timezone.utc)
            cutoff_timestamp = int((self.cutoff_date - chrome_epoch).total_seconds() * 1_000_000)

            # The WHERE clause filters on visits.visit_time, which discards
            # every URL without a matching visit, so this is an inner join;
            # spell it as one.
            query = """
                SELECT
                    urls.url,
                    urls.title,
                    urls.visit_count,
                    visits.visit_time
                FROM urls
                JOIN visits ON urls.id = visits.url
                WHERE visits.visit_time > ?
                ORDER BY visits.visit_time DESC
            """

            cursor.execute(query, (cutoff_timestamp,))

            for row in cursor:
                # Convert Chrome's timestamp to datetime
                visit_time = chrome_epoch + timedelta(microseconds=row["visit_time"])

                yield {
                    "url": row["url"],
                    "title": row["title"],
                    "visit_time": visit_time.strftime(UTC_FMT),
                    "browser": "chrome",
                    "visit_count": row["visit_count"] or 1,
                }

        except Exception as e:
            # Deliberately non-fatal: one unreadable browser must not abort
            # the whole indexing run.
            logger.error("Error parsing Chrome history: %s", e)
        finally:
            if conn:
                conn.close()
            # Remove the snapshot (and any sidecar files SQLite created).
            shutil.rmtree(tmp_dir, ignore_errors=True)
189
+
190
+
191
+ class BrowserManager:
192
+ """Manage parsing of multiple browsers."""
193
+
194
+ def __init__(self, config: Dict, since: datetime | None = None):
195
+ self.config = config
196
+ self.lookback_days = config.get("indexing", {}).get("lookback_days", 14)
197
+ self.browsers = config.get("browsers", [])
198
+ self.since = since
199
+
200
+ def parse_all(self) -> Generator[Dict, None, None]:
201
+ """Parse history from all configured browsers."""
202
+ for browser in self.browsers:
203
+ browser_lower = browser.lower()
204
+
205
+ if browser_lower == "safari":
206
+ parser = SafariParser(self.lookback_days, since=self.since)
207
+ logger.info("Parsing Safari history...")
208
+ yield from parser.parse()
209
+
210
+ elif browser_lower == "chrome":
211
+ parser = ChromeParser(self.lookback_days, since=self.since)
212
+ logger.info("Parsing Chrome history...")
213
+ yield from parser.parse()
214
+
215
+ else:
216
+ logger.warning(f"Unknown browser: {browser}")
@@ -0,0 +1,156 @@
1
+ """Chat dedup detection and merge.
2
+
3
+ Orchestrates near-duplicate chat detection via db.chats and merges
4
+ duplicates by combining unique messages from source into target,
5
+ marking the source as status='merged', and updating vector embeddings.
6
+ """
7
+
8
+ import hashlib
9
+ import logging
10
+ from dataclasses import dataclass
11
+ from typing import Any, Dict, List, Optional
12
+
13
+ from footprinter.db import chats as chats_db
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
@dataclass
class DuplicateGroup:
    """Set of chats that duplicate detection flagged as likely copies.

    Fields:
        reason: which heuristic matched -- 'exact_title', 'fuzzy_title'
            or 'message_overlap'.
        confidence: 'high' or 'medium'.
        chats: the chat dicts that make up the group.
        detail: human-readable explanation; empty when none was given.
    """

    reason: str
    confidence: str
    chats: List[Dict]
    detail: str = ""
26
+
27
+
28
class ChatDedup:
    """Duplicate detection and merge for chats.

    Wraps a database handle exposing a ``conn`` attribute and delegates the
    actual queries to ``footprinter.db.chats``.
    """

    def __init__(self, db):
        self.db = db
        # chat id -> message content hashes, filled lazily by _get_hashes.
        self._hash_cache: Dict[int, List[str]] = {}

    def _get_hashes(self, chat_id: int) -> List[str]:
        """Get message content hashes, with caching."""
        if chat_id not in self._hash_cache:
            self._hash_cache[chat_id] = chats_db.get_chat_message_hashes(self.db.conn, chat_id)
        return self._hash_cache[chat_id]

    def detect_duplicates(self) -> List["DuplicateGroup"]:
        """Detect potential duplicate chats.

        Delegates to ``footprinter.db.chats.detect_duplicates`` and
        converts plain dicts back to ``DuplicateGroup`` dataclasses.
        """
        raw_groups = chats_db.detect_duplicates(self.db.conn)
        return [
            DuplicateGroup(
                reason=g["reason"],
                confidence=g["confidence"],
                chats=g["chats"],
                # ``detail`` is optional on the dataclass, so tolerate its
                # absence in the raw dict as well.
                detail=g.get("detail", ""),
            )
            for g in raw_groups
        ]

    def merge(
        self,
        target_id: int,
        source_id: int,
        vector_store: Optional[Any] = None,
    ) -> Dict:
        """Merge source chat into target.

        1. Validate both exist and aren't already merged
        2. Hash target's messages
        3. Identify unique messages in source
        4. Move unique messages to target
        5. Recount target's message_count
        6. Mark source as merged
        7. Update vectors if vector_store provided

        Steps 4-6 are committed together; if any of them fails the
        connection is rolled back and the exception is re-raised, so no
        partial merge stays pending on the connection. A failing vector
        update is only logged, because the database merge has already been
        committed at that point.

        Args:
            target_id: id of the chat that receives the messages.
            source_id: id of the chat that is merged away.
            vector_store: optional store; when given, the source chat's
                vectors are deleted via ``delete_by_metadata``.

        Returns:
            Dict with merge stats: ``target_id``, ``source_id``,
            ``target_title``, ``source_title``, ``messages_moved``,
            ``duplicates_skipped``, ``new_message_count`` and
            ``vectors_updated``.

        Raises:
            ValueError: if both ids are equal, either chat does not exist,
                or either chat is already merged.
        """
        if target_id == source_id:
            raise ValueError("Cannot merge a chat into itself")

        conn = self.db.conn
        target = chats_db.get_chat_by_id(conn, target_id)
        source = chats_db.get_chat_by_id(conn, source_id)

        if not target:
            raise ValueError(f"Target chat {target_id} not found")
        if not source:
            raise ValueError(f"Source chat {source_id} not found")
        if target.get("status") == "merged":
            raise ValueError(f"Target chat {target_id} is already merged")
        if source.get("status") == "merged":
            raise ValueError(f"Source chat {source_id} is already merged")

        # Hash target's messages to identify what's already there
        target_hashes = set(self._get_hashes(target_id))

        # Find unique messages in source (not already in target)
        source_messages = chats_db.get_chat_messages(conn, source_id)
        unique_message_ids = []
        duplicate_count = 0
        for msg in source_messages:
            content = msg["content"] or ""
            msg_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
            if msg_hash not in target_hashes:
                unique_message_ids.append(msg["id"])
            else:
                duplicate_count += 1

        try:
            # Move unique messages to target
            moved = 0
            if unique_message_ids:
                moved = chats_db.move_messages_to_chat(conn, source_id, target_id, unique_message_ids)

            # Recount target's messages
            new_count = chats_db.update_chat_message_count(conn, target_id)

            # Mark source as merged
            chats_db.mark_chat_merged(conn, source_id, target_id)

            # Commit the entire merge atomically (move + recount + mark)
            conn.commit()
        except Exception:
            # Undo whatever part of the merge was already written so a later
            # commit on this connection cannot persist a half-finished merge.
            try:
                conn.rollback()
            except Exception:
                logger.exception("Rollback failed after aborted merge of chat %d into %d", source_id, target_id)
            raise
        finally:
            # Invalidate hash cache -- on failure too, in case anything was
            # persisted before the error.
            self._hash_cache.pop(target_id, None)
            self._hash_cache.pop(source_id, None)

        # Update vectors if store provided. Compare against None so that a
        # store object which happens to be falsy (e.g. empty) is still used.
        vectors_updated = False
        if vector_store is not None:
            try:
                # Delete source chat vectors
                vector_store.delete_by_metadata({"chat_id": source_id})
                # Re-index moved messages under target
                # (Caller is responsible for full re-vectorization)
                vectors_updated = True
            except Exception as e:
                logger.warning("Vector update failed (non-fatal): %s", e)

        result = {
            "target_id": target_id,
            "source_id": source_id,
            "target_title": target.get("title"),
            "source_title": source.get("title"),
            "messages_moved": moved,
            "duplicates_skipped": duplicate_count,
            "new_message_count": new_count,
            "vectors_updated": vectors_updated,
        }

        logger.info(
            "Merged chat %d into %d: %d messages moved, %d duplicates skipped",
            source_id,
            target_id,
            moved,
            duplicate_count,
        )
        return result