footprinter-cli 1.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. footprinter/__init__.py +8 -0
  2. footprinter/access.py +431 -0
  3. footprinter/api/__init__.py +1 -0
  4. footprinter/api/db.py +61 -0
  5. footprinter/api/entities.py +250 -0
  6. footprinter/api/search.py +47 -0
  7. footprinter/api/semantic.py +33 -0
  8. footprinter/api/server.py +66 -0
  9. footprinter/api/status.py +15 -0
  10. footprinter/bundled/__init__.py +0 -0
  11. footprinter/bundled/config.example.yaml +161 -0
  12. footprinter/bundled/patterns/context_patterns.yaml +18 -0
  13. footprinter/bundled/patterns/extensions.yaml +283 -0
  14. footprinter/bundled/patterns/filename_patterns.yaml +61 -0
  15. footprinter/bundled/patterns/mime_mappings.yaml +68 -0
  16. footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
  17. footprinter/bundled/patterns/security_patterns.yaml +27 -0
  18. footprinter/bundled/samples/hidden-client-file-sample.txt +2 -0
  19. footprinter/bundled/samples/opaque-project-file-sample.txt +2 -0
  20. footprinter/bundled/samples/visible-file-sample.txt +2 -0
  21. footprinter/cli/__init__.py +135 -0
  22. footprinter/cli/__main__.py +6 -0
  23. footprinter/cli/_common.py +327 -0
  24. footprinter/cli/_policy_helpers.py +646 -0
  25. footprinter/cli/_prompt.py +220 -0
  26. footprinter/cli/_sample_seed.py +204 -0
  27. footprinter/cli/api_cmd.py +32 -0
  28. footprinter/cli/connect.py +591 -0
  29. footprinter/cli/data.py +879 -0
  30. footprinter/cli/delete.py +128 -0
  31. footprinter/cli/ingest.py +543 -0
  32. footprinter/cli/mcp_cmd.py +750 -0
  33. footprinter/cli/mcp_setup.py +306 -0
  34. footprinter/cli/search.py +393 -0
  35. footprinter/cli/search_cmd.py +69 -0
  36. footprinter/cli/setup.py +2001 -0
  37. footprinter/cli/status.py +747 -0
  38. footprinter/cli/status_cmd.py +104 -0
  39. footprinter/cli/upsert.py +794 -0
  40. footprinter/cli/vectorize_cmd.py +215 -0
  41. footprinter/cli/view.py +322 -0
  42. footprinter/connectors/__init__.py +171 -0
  43. footprinter/connectors/config_utils.py +141 -0
  44. footprinter/db/__init__.py +37 -0
  45. footprinter/db/browser.py +198 -0
  46. footprinter/db/chats.py +602 -0
  47. footprinter/db/clients.py +307 -0
  48. footprinter/db/emails.py +279 -0
  49. footprinter/db/files.py +724 -0
  50. footprinter/db/folders.py +659 -0
  51. footprinter/db/messages.py +192 -0
  52. footprinter/db/policies.py +151 -0
  53. footprinter/db/projects.py +673 -0
  54. footprinter/db/search.py +573 -0
  55. footprinter/db/sql_utils.py +168 -0
  56. footprinter/db/status.py +320 -0
  57. footprinter/db/uploads.py +70 -0
  58. footprinter/ingest/__init__.py +0 -0
  59. footprinter/ingest/adapters/__init__.py +33 -0
  60. footprinter/ingest/adapters/browser.py +54 -0
  61. footprinter/ingest/adapters/chat.py +57 -0
  62. footprinter/ingest/adapters/ingest.py +146 -0
  63. footprinter/ingest/adapters/local_files.py +68 -0
  64. footprinter/ingest/adapters/local_folders.py +52 -0
  65. footprinter/ingest/adapters/protocol.py +174 -0
  66. footprinter/ingest/browser_indexer.py +216 -0
  67. footprinter/ingest/chat_dedup.py +156 -0
  68. footprinter/ingest/chat_indexer.py +487 -0
  69. footprinter/ingest/chat_parsers/__init__.py +8 -0
  70. footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
  71. footprinter/ingest/chat_parsers/claude_parser.py +161 -0
  72. footprinter/ingest/cli.py +827 -0
  73. footprinter/ingest/content_extractors.py +117 -0
  74. footprinter/ingest/database.py +36 -0
  75. footprinter/ingest/db/__init__.py +1 -0
  76. footprinter/ingest/db/connector_schema.py +47 -0
  77. footprinter/ingest/db/migration.py +315 -0
  78. footprinter/ingest/db/schema.py +1043 -0
  79. footprinter/ingest/db/security.py +6 -0
  80. footprinter/ingest/file_indexer.py +223 -0
  81. footprinter/ingest/file_scanner.py +277 -0
  82. footprinter/ingest/folder_indexer.py +226 -0
  83. footprinter/ingest/full_content_extractor.py +321 -0
  84. footprinter/ingest/orchestrator.py +112 -0
  85. footprinter/ingest/pipe_runner.py +200 -0
  86. footprinter/ingest/processing.py +165 -0
  87. footprinter/ingest/registry.py +186 -0
  88. footprinter/ingest/run_record.py +91 -0
  89. footprinter/ingest/status.py +346 -0
  90. footprinter/mcp/__init__.py +0 -0
  91. footprinter/mcp/__main__.py +5 -0
  92. footprinter/mcp/db.py +67 -0
  93. footprinter/mcp/errors.py +105 -0
  94. footprinter/mcp/extraction.py +226 -0
  95. footprinter/mcp/server.py +39 -0
  96. footprinter/mcp/tools/__init__.py +0 -0
  97. footprinter/mcp/tools/navigation.py +70 -0
  98. footprinter/mcp/tools/read.py +75 -0
  99. footprinter/mcp/tools/search.py +158 -0
  100. footprinter/mcp/tools/semantic.py +79 -0
  101. footprinter/mcp/tools/status.py +19 -0
  102. footprinter/paths.py +117 -0
  103. footprinter/permissions.py +1152 -0
  104. footprinter/semantic/__init__.py +13 -0
  105. footprinter/semantic/chunking.py +52 -0
  106. footprinter/semantic/embeddings.py +23 -0
  107. footprinter/semantic/hybrid_search.py +273 -0
  108. footprinter/semantic/vector_store.py +471 -0
  109. footprinter/services/__init__.py +49 -0
  110. footprinter/services/access_service.py +342 -0
  111. footprinter/services/chat_service.py +85 -0
  112. footprinter/services/client_service.py +267 -0
  113. footprinter/services/content_service.py +181 -0
  114. footprinter/services/email_service.py +89 -0
  115. footprinter/services/file_service.py +83 -0
  116. footprinter/services/folder_service.py +122 -0
  117. footprinter/services/includes.py +19 -0
  118. footprinter/services/ingest_service.py +231 -0
  119. footprinter/services/project_service.py +262 -0
  120. footprinter/services/roles.py +25 -0
  121. footprinter/services/search_service.py +177 -0
  122. footprinter/services/semantic_service.py +360 -0
  123. footprinter/services/status_service.py +18 -0
  124. footprinter/services/visit_service.py +65 -0
  125. footprinter/source_registry.py +194 -0
  126. footprinter/utils/__init__.py +7 -0
  127. footprinter/utils/hash_utils.py +59 -0
  128. footprinter/utils/logging_config.py +68 -0
  129. footprinter/utils/mime.py +30 -0
  130. footprinter/utils/text.py +6 -0
  131. footprinter/utils/time.py +11 -0
  132. footprinter/visibility.py +1264 -0
  133. footprinter_cli-1.0.0rc1.dist-info/LICENSE +21 -0
  134. footprinter_cli-1.0.0rc1.dist-info/METADATA +223 -0
  135. footprinter_cli-1.0.0rc1.dist-info/RECORD +138 -0
  136. footprinter_cli-1.0.0rc1.dist-info/WHEEL +5 -0
  137. footprinter_cli-1.0.0rc1.dist-info/entry_points.txt +2 -0
  138. footprinter_cli-1.0.0rc1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,229 @@
1
+ """
2
+ Parser for ChatGPT chat export format.
3
+
4
+ ChatGPT exports chats as conversations.json with structure:
5
+ {
6
+ "title": "Chat Title",
7
+ "create_time": 1764563881.923587, # Unix timestamp
8
+ "update_time": 1764564263.347883,
9
+ "conversation_id": "uuid",
10
+ "mapping": {
11
+ "node-uuid": {
12
+ "parent": "parent-uuid" or None,
13
+ "children": ["child-uuid", ...],
14
+ "message": {
15
+ "author": {"role": "user"|"assistant"|"system"},
16
+ "content": {"content_type": "text", "parts": ["message text"]},
17
+ "create_time": 1764563881.0
18
+ }
19
+ }
20
+ }
21
+ }
22
+
23
+ Messages form a tree structure - we walk from root to extract in order.
24
+ """
25
+
26
+ import json
27
+ import logging
28
+ import uuid
29
+ from datetime import datetime, timezone
30
+ from pathlib import Path
31
+ from typing import Dict, Generator, List
32
+
33
+ from footprinter.utils.time import UTC_FMT
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
class ChatGPTParser:
    """Parser for ChatGPT chat export files (``conversations.json``).

    Each conversation stores its messages as a tree (``mapping``) keyed by
    node id. The parser flattens that tree into a list of user/assistant
    messages and converts Unix timestamps into ``UTC_FMT`` strings.
    """

    # Author roles that are kept; every other role (system, tool, ...) is dropped.
    _CHAT_ROLES = ("user", "assistant")

    def __init__(self, export_file: Path):
        """
        Initialize parser with export file path.

        Args:
            export_file: Path to ChatGPT conversations.json

        Raises:
            FileNotFoundError: If ``export_file`` does not exist.
        """
        self.export_file = Path(export_file)

        if not self.export_file.exists():
            raise FileNotFoundError(f"conversations.json not found at {export_file}")

    @staticmethod
    def _format_timestamp(timestamp):
        """Return a Unix timestamp as a ``UTC_FMT`` string, or None when it is missing/falsy."""
        if not timestamp:
            return None
        return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime(UTC_FMT)

    def parse_chats(self) -> Generator[Dict, None, None]:
        """
        Parse chats from conversations.json and yield chat records.

        The whole file is loaded with ``json.load`` before the first record
        is yielded.

        Yields:
            Dict with chat data including messages
        """
        logger.info("Parsing ChatGPT export from %s", self.export_file)

        with open(self.export_file, "r", encoding="utf-8") as f:
            chats_data = json.load(f)

        logger.info("Found %s chats", len(chats_data))

        for conv in chats_data:
            messages = self._extract_messages(conv.get("mapping", {}))

            yield {
                # Prefer the export's own identifiers; a random UUID is the
                # last resort so every record still gets an id.
                "external_id": conv.get("conversation_id") or conv.get("id") or str(uuid.uuid4()),
                "source": "chatgpt",
                "title": conv.get("title", ""),
                "summary": "",  # ChatGPT doesn't provide summaries
                "created_at": self._format_timestamp(conv.get("create_time")),
                "updated_at": self._format_timestamp(conv.get("update_time")),
                "message_count": len(messages),
                "messages": messages,
                "metadata": {
                    "model": conv.get("default_model_slug"),
                    "gizmo_id": conv.get("gizmo_id"),
                    "gizmo_type": conv.get("gizmo_type"),
                    "is_archived": conv.get("is_archived"),
                },
            }

    def _extract_messages(self, mapping: Dict) -> List[Dict]:
        """
        Extract messages from the mapping tree structure.

        Walks the tree iteratively (no recursion, so long chats cannot hit
        the recursion limit) in pre-order depth-first order: a node is
        emitted first, then each of its children's subtrees in the order the
        children are listed.

        Args:
            mapping: The mapping dict from the chat

        Returns:
            List of user/assistant message dicts in traversal order; empty
            when the mapping is empty or has no root node.
        """
        if not mapping:
            return []

        # The root is the first node whose parent is None.
        root_id = next(
            (node_id for node_id, node_data in mapping.items() if node_data.get("parent") is None),
            None,
        )
        if not root_id:
            return []

        messages = []
        visited = set()
        stack = [root_id]

        while stack:
            node_id = stack.pop()  # O(1) pop from the end of the list

            # The visited set protects against malformed exports with cycles.
            if not node_id or node_id in visited:
                continue
            visited.add(node_id)

            node = mapping.get(node_id) or {}
            message = self._build_message(node.get("message"))
            if message is not None:
                messages.append(message)

            # Push children reversed so the first child is popped first.
            stack.extend(reversed(node.get("children") or []))

        return messages

    def _build_message(self, msg):
        """Convert one raw message node into a message dict, or None if it should be skipped.

        A message is skipped when it is missing, its author role is not
        user/assistant, or its extracted text is empty/whitespace.
        """
        if not msg:
            return None

        # ``author`` / ``metadata`` may be JSON null, so fall back to {}.
        role = (msg.get("author") or {}).get("role", "unknown")
        if role not in self._CHAT_ROLES:
            return None

        content = self._extract_content(msg.get("content", {}))
        if not content.strip():
            return None

        return {
            "message_id": msg.get("id") or str(uuid.uuid4()),
            "role": role,
            "content": content,
            "created_at": self._format_timestamp(msg.get("create_time")),
            "metadata": {
                "model_slug": (msg.get("metadata") or {}).get("model_slug"),
            },
        }

    def _extract_content(self, content: Dict) -> str:
        """
        Extract text content from message content structure.

        Args:
            content: Content dict with content_type and parts

        Returns:
            The string parts (and the ``text`` of dict parts) joined with
            newlines; "" when there is nothing textual. A non-dict
            ``content`` is returned as ``str(content)`` ("" when falsy).
        """
        if not isinstance(content, dict):
            return str(content) if content else ""

        parts = content.get("parts", [])
        if not parts:
            return ""

        text_parts = []
        for part in parts:
            if isinstance(part, str):
                text_parts.append(part)
            elif isinstance(part, dict):
                # Dict parts only contribute when they carry a string "text";
                # anything else would break the join below.
                text = part.get("text")
                if isinstance(text, str):
                    text_parts.append(text)

        return "\n".join(text_parts)

    def get_stats(self) -> Dict:
        """
        Get statistics about the export without full parsing.

        Counts every user/assistant node in each mapping, including ones
        with empty content that ``parse_chats`` would drop.

        Returns:
            Dict with chat count, message count, date range
        """
        with open(self.export_file, "r", encoding="utf-8") as f:
            chats_data = json.load(f)

        total_messages = 0
        dates = []

        for conv in chats_data:
            # ``mapping`` may be missing or JSON null.
            mapping = conv.get("mapping") or {}
            for node_data in mapping.values():
                msg = (node_data or {}).get("message")
                if msg and (msg.get("author") or {}).get("role") in self._CHAT_ROLES:
                    total_messages += 1

            create_time = conv.get("create_time")
            if create_time:
                dates.append(datetime.fromtimestamp(create_time, tz=timezone.utc))

        earliest = min(dates).strftime(UTC_FMT) if dates else None
        latest = max(dates).strftime(UTC_FMT) if dates else None

        return {
            "total_chats": len(chats_data),
            "chats_with_messages": sum(1 for conv in chats_data if conv.get("mapping")),
            "total_messages": total_messages,
            "earliest_chat": earliest,
            "latest_chat": latest,
        }
@@ -0,0 +1,161 @@
1
+ """
2
+ Parser for Claude chat export format.
3
+
4
+ Claude exports chats as a single conversations.json file with structure:
5
+ {
6
+ "uuid": "chat-id",
7
+ "name": "Chat Title",
8
+ "summary": "AI-generated summary",
9
+ "created_at": "2025-11-22T12:59:49.779155Z",
10
+ "updated_at": "2025-11-22T13:00:01.934675Z",
11
+ "chat_messages": [
12
+ {
13
+ "uuid": "message-id",
14
+ "text": "Full message text",
15
+ "content": [{"start_timestamp": "...", "type": "text", "text": "..."}]
16
+ }
17
+ ]
18
+ }
19
+
20
+ Messages alternate user/assistant - role inferred from position (even=user, odd=assistant).
21
+ """
22
+
23
+ import json
24
+ import logging
25
+ from pathlib import Path
26
+ from typing import Dict, Generator, List, Optional
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
class ClaudeParser:
    """Parser for Claude chat export files.

    Reads ``conversations.json`` (and, optionally, ``memories.json`` /
    ``projects.json``) from an export directory.
    """

    # Maps a message's explicit sender label to the role used in parsed output.
    # NOTE(review): assumes exports label messages with "sender":
    # "human"/"assistant" — confirm against a real export. When the field is
    # absent the positional inference below is used unchanged.
    _SENDER_ROLES = {"human": "user", "assistant": "assistant"}

    def __init__(self, export_dir: Path):
        """
        Initialize parser with export directory path.

        Args:
            export_dir: Path to Claude export directory containing conversations.json

        Raises:
            FileNotFoundError: If conversations.json is not in ``export_dir``.
        """
        self.export_dir = Path(export_dir)
        self.chats_file = self.export_dir / "conversations.json"

        if not self.chats_file.exists():
            raise FileNotFoundError(f"conversations.json not found in {export_dir}")

    def parse_chats(self) -> Generator[Dict, None, None]:
        """
        Parse chats from conversations.json and yield chat records.

        The whole file is loaded with ``json.load`` before the first record
        is yielded.

        Yields:
            Dict with chat data including messages

        Raises:
            KeyError: If a conversation has no ``uuid``.
        """
        logger.info("Parsing Claude export from %s", self.export_dir)

        with open(self.chats_file, "r", encoding="utf-8") as f:
            chats_data = json.load(f)

        logger.info("Found %s chats", len(chats_data))

        for conv in chats_data:
            # ``chat_messages`` / ``account`` may be missing or JSON null.
            messages = conv.get("chat_messages") or []

            yield {
                "external_id": conv["uuid"],
                "source": "claude",
                "title": conv.get("name", ""),
                "summary": conv.get("summary", ""),
                "created_at": conv.get("created_at"),
                "updated_at": conv.get("updated_at"),
                "message_count": len(messages),
                "messages": self._parse_messages(messages),
                "metadata": {
                    "account_uuid": (conv.get("account") or {}).get("uuid"),
                },
            }

    def _parse_messages(self, messages: List[Dict]) -> List[Dict]:
        """
        Parse chat messages into role/content/timestamp records.

        The role comes from the message's ``sender`` value when it is one of
        the known labels; otherwise it is inferred from position
        (even index = user, odd index = assistant).

        Args:
            messages: List of message dicts from export

        Returns:
            List of parsed message dicts with role, content, timestamp
        """
        parsed = []

        for index, msg in enumerate(messages):
            positional_role = "user" if index % 2 == 0 else "assistant"
            sender = msg.get("sender")
            # Only string labels are looked up, so an unexpected (unhashable)
            # value cannot raise; unknown labels fall back to position.
            explicit_role = self._SENDER_ROLES.get(sender) if isinstance(sender, str) else None

            # The timestamp lives on the first content block, when there is one.
            created_at = None
            blocks = msg.get("content")
            if isinstance(blocks, list) and blocks and isinstance(blocks[0], dict):
                created_at = blocks[0].get("start_timestamp")

            parsed.append(
                {
                    "message_id": msg.get("uuid"),
                    "role": explicit_role or positional_role,
                    "content": msg.get("text", ""),
                    "created_at": created_at,
                    "metadata": {},
                }
            )

        return parsed

    def get_stats(self) -> Dict:
        """
        Get statistics about the export without full parsing.

        The date range is the min/max of the raw ``created_at`` strings,
        compared as strings.

        Returns:
            Dict with chat count, message count, date range
        """
        with open(self.chats_file, "r", encoding="utf-8") as f:
            chats_data = json.load(f)

        # ``or []`` keeps a JSON-null chat_messages from breaking len().
        total_messages = sum(len(conv.get("chat_messages") or []) for conv in chats_data)

        dates = [conv.get("created_at") for conv in chats_data if conv.get("created_at")]
        earliest = min(dates) if dates else None
        latest = max(dates) if dates else None

        return {
            "total_chats": len(chats_data),
            "chats_with_messages": sum(1 for conv in chats_data if conv.get("chat_messages")),
            "total_messages": total_messages,
            "earliest_chat": earliest,
            "latest_chat": latest,
        }

    def _load_optional_json(self, filename: str) -> Optional[List[Dict]]:
        """Load ``filename`` from the export directory, or return None if it is absent."""
        target = self.export_dir / filename
        if not target.exists():
            return None

        with open(target, "r", encoding="utf-8") as f:
            return json.load(f)

    def parse_memories(self) -> Optional[List[Dict]]:
        """
        Parse memories.json if present.

        Returns:
            Decoded contents of memories.json, or None if file doesn't exist
        """
        return self._load_optional_json("memories.json")

    def parse_projects(self) -> Optional[List[Dict]]:
        """
        Parse projects.json if present.

        Returns:
            Decoded contents of projects.json, or None if file doesn't exist
        """
        return self._load_optional_json("projects.json")