@oriro/orirocli 0.1.9 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. package/README.md +16 -18
  2. package/dist/cli.js +4776 -2964
  3. package/package.json +2 -2
  4. package/skills/craft/ai-engineering/SKILL.md +2 -2
  5. package/skills/graphify/SKILL.md +0 -619
  6. package/skills/graphify/__init__.py +0 -28
  7. package/skills/graphify/__main__.py +0 -4582
  8. package/skills/graphify/affected.py +0 -154
  9. package/skills/graphify/always_on/agents-md.md +0 -12
  10. package/skills/graphify/always_on/antigravity-rules.md +0 -14
  11. package/skills/graphify/always_on/claude-md.md +0 -9
  12. package/skills/graphify/always_on/gemini-md.md +0 -9
  13. package/skills/graphify/always_on/kiro-steering.md +0 -5
  14. package/skills/graphify/always_on/vscode-instructions.md +0 -17
  15. package/skills/graphify/analyze.py +0 -724
  16. package/skills/graphify/benchmark.py +0 -155
  17. package/skills/graphify/build.py +0 -487
  18. package/skills/graphify/cache.py +0 -417
  19. package/skills/graphify/callflow_html.py +0 -2020
  20. package/skills/graphify/cluster.py +0 -272
  21. package/skills/graphify/command-kilo.md +0 -15
  22. package/skills/graphify/dedup.py +0 -429
  23. package/skills/graphify/detect.py +0 -1379
  24. package/skills/graphify/diagnostics.py +0 -390
  25. package/skills/graphify/export.py +0 -1408
  26. package/skills/graphify/extract.py +0 -11570
  27. package/skills/graphify/global_graph.py +0 -159
  28. package/skills/graphify/google_workspace.py +0 -223
  29. package/skills/graphify/hooks.py +0 -457
  30. package/skills/graphify/ingest.py +0 -331
  31. package/skills/graphify/llm.py +0 -1896
  32. package/skills/graphify/manifest.py +0 -4
  33. package/skills/graphify/mcp_ingest.py +0 -392
  34. package/skills/graphify/multigraph_compat.py +0 -212
  35. package/skills/graphify/pg_introspect.py +0 -142
  36. package/skills/graphify/prs.py +0 -748
  37. package/skills/graphify/querylog.py +0 -70
  38. package/skills/graphify/report.py +0 -218
  39. package/skills/graphify/scip_ingest.py +0 -363
  40. package/skills/graphify/security.py +0 -336
  41. package/skills/graphify/semantic_cleanup.py +0 -319
  42. package/skills/graphify/serve.py +0 -1309
  43. package/skills/graphify/skill-aider.md +0 -1246
  44. package/skills/graphify/skill-amp.md +0 -613
  45. package/skills/graphify/skill-claw.md +0 -616
  46. package/skills/graphify/skill-codex.md +0 -613
  47. package/skills/graphify/skill-copilot.md +0 -616
  48. package/skills/graphify/skill-devin.md +0 -1372
  49. package/skills/graphify/skill-droid.md +0 -613
  50. package/skills/graphify/skill-kilo.md +0 -625
  51. package/skills/graphify/skill-kiro.md +0 -615
  52. package/skills/graphify/skill-opencode.md +0 -608
  53. package/skills/graphify/skill-pi.md +0 -615
  54. package/skills/graphify/skill-trae.md +0 -614
  55. package/skills/graphify/skill-vscode.md +0 -612
  56. package/skills/graphify/skill-windows.md +0 -651
  57. package/skills/graphify/skills/amp/references/add-watch.md +0 -56
  58. package/skills/graphify/skills/amp/references/exports.md +0 -71
  59. package/skills/graphify/skills/amp/references/extraction-spec.md +0 -68
  60. package/skills/graphify/skills/amp/references/github-and-merge.md +0 -46
  61. package/skills/graphify/skills/amp/references/hooks.md +0 -33
  62. package/skills/graphify/skills/amp/references/query.md +0 -249
  63. package/skills/graphify/skills/amp/references/transcribe.md +0 -48
  64. package/skills/graphify/skills/amp/references/update.md +0 -179
  65. package/skills/graphify/skills/claude/references/add-watch.md +0 -56
  66. package/skills/graphify/skills/claude/references/exports.md +0 -71
  67. package/skills/graphify/skills/claude/references/extraction-spec.md +0 -68
  68. package/skills/graphify/skills/claude/references/github-and-merge.md +0 -46
  69. package/skills/graphify/skills/claude/references/hooks.md +0 -33
  70. package/skills/graphify/skills/claude/references/query.md +0 -103
  71. package/skills/graphify/skills/claude/references/transcribe.md +0 -48
  72. package/skills/graphify/skills/claude/references/update.md +0 -179
  73. package/skills/graphify/skills/claw/references/add-watch.md +0 -56
  74. package/skills/graphify/skills/claw/references/exports.md +0 -71
  75. package/skills/graphify/skills/claw/references/extraction-spec.md +0 -29
  76. package/skills/graphify/skills/claw/references/github-and-merge.md +0 -46
  77. package/skills/graphify/skills/claw/references/hooks.md +0 -33
  78. package/skills/graphify/skills/claw/references/query.md +0 -249
  79. package/skills/graphify/skills/claw/references/transcribe.md +0 -48
  80. package/skills/graphify/skills/claw/references/update.md +0 -179
  81. package/skills/graphify/skills/codex/references/add-watch.md +0 -56
  82. package/skills/graphify/skills/codex/references/exports.md +0 -71
  83. package/skills/graphify/skills/codex/references/extraction-spec.md +0 -29
  84. package/skills/graphify/skills/codex/references/github-and-merge.md +0 -46
  85. package/skills/graphify/skills/codex/references/hooks.md +0 -33
  86. package/skills/graphify/skills/codex/references/query.md +0 -249
  87. package/skills/graphify/skills/codex/references/transcribe.md +0 -48
  88. package/skills/graphify/skills/codex/references/update.md +0 -179
  89. package/skills/graphify/skills/copilot/references/add-watch.md +0 -56
  90. package/skills/graphify/skills/copilot/references/exports.md +0 -71
  91. package/skills/graphify/skills/copilot/references/extraction-spec.md +0 -68
  92. package/skills/graphify/skills/copilot/references/github-and-merge.md +0 -46
  93. package/skills/graphify/skills/copilot/references/hooks.md +0 -33
  94. package/skills/graphify/skills/copilot/references/query.md +0 -249
  95. package/skills/graphify/skills/copilot/references/transcribe.md +0 -48
  96. package/skills/graphify/skills/copilot/references/update.md +0 -179
  97. package/skills/graphify/skills/droid/references/add-watch.md +0 -56
  98. package/skills/graphify/skills/droid/references/exports.md +0 -71
  99. package/skills/graphify/skills/droid/references/extraction-spec.md +0 -68
  100. package/skills/graphify/skills/droid/references/github-and-merge.md +0 -46
  101. package/skills/graphify/skills/droid/references/hooks.md +0 -33
  102. package/skills/graphify/skills/droid/references/query.md +0 -249
  103. package/skills/graphify/skills/droid/references/transcribe.md +0 -48
  104. package/skills/graphify/skills/droid/references/update.md +0 -179
  105. package/skills/graphify/skills/kilo/references/add-watch.md +0 -56
  106. package/skills/graphify/skills/kilo/references/exports.md +0 -71
  107. package/skills/graphify/skills/kilo/references/extraction-spec.md +0 -68
  108. package/skills/graphify/skills/kilo/references/github-and-merge.md +0 -46
  109. package/skills/graphify/skills/kilo/references/hooks.md +0 -33
  110. package/skills/graphify/skills/kilo/references/query.md +0 -249
  111. package/skills/graphify/skills/kilo/references/transcribe.md +0 -48
  112. package/skills/graphify/skills/kilo/references/update.md +0 -179
  113. package/skills/graphify/skills/kiro/references/add-watch.md +0 -56
  114. package/skills/graphify/skills/kiro/references/exports.md +0 -71
  115. package/skills/graphify/skills/kiro/references/extraction-spec.md +0 -29
  116. package/skills/graphify/skills/kiro/references/github-and-merge.md +0 -46
  117. package/skills/graphify/skills/kiro/references/hooks.md +0 -33
  118. package/skills/graphify/skills/kiro/references/query.md +0 -249
  119. package/skills/graphify/skills/kiro/references/transcribe.md +0 -48
  120. package/skills/graphify/skills/kiro/references/update.md +0 -179
  121. package/skills/graphify/skills/opencode/references/add-watch.md +0 -56
  122. package/skills/graphify/skills/opencode/references/exports.md +0 -71
  123. package/skills/graphify/skills/opencode/references/extraction-spec.md +0 -68
  124. package/skills/graphify/skills/opencode/references/github-and-merge.md +0 -46
  125. package/skills/graphify/skills/opencode/references/hooks.md +0 -33
  126. package/skills/graphify/skills/opencode/references/query.md +0 -249
  127. package/skills/graphify/skills/opencode/references/transcribe.md +0 -48
  128. package/skills/graphify/skills/opencode/references/update.md +0 -179
  129. package/skills/graphify/skills/pi/references/add-watch.md +0 -56
  130. package/skills/graphify/skills/pi/references/exports.md +0 -71
  131. package/skills/graphify/skills/pi/references/extraction-spec.md +0 -29
  132. package/skills/graphify/skills/pi/references/github-and-merge.md +0 -46
  133. package/skills/graphify/skills/pi/references/hooks.md +0 -33
  134. package/skills/graphify/skills/pi/references/query.md +0 -249
  135. package/skills/graphify/skills/pi/references/transcribe.md +0 -48
  136. package/skills/graphify/skills/pi/references/update.md +0 -179
  137. package/skills/graphify/skills/trae/references/add-watch.md +0 -56
  138. package/skills/graphify/skills/trae/references/exports.md +0 -71
  139. package/skills/graphify/skills/trae/references/extraction-spec.md +0 -68
  140. package/skills/graphify/skills/trae/references/github-and-merge.md +0 -46
  141. package/skills/graphify/skills/trae/references/hooks.md +0 -35
  142. package/skills/graphify/skills/trae/references/query.md +0 -249
  143. package/skills/graphify/skills/trae/references/transcribe.md +0 -48
  144. package/skills/graphify/skills/trae/references/update.md +0 -179
  145. package/skills/graphify/skills/vscode/references/add-watch.md +0 -56
  146. package/skills/graphify/skills/vscode/references/exports.md +0 -71
  147. package/skills/graphify/skills/vscode/references/extraction-spec.md +0 -68
  148. package/skills/graphify/skills/vscode/references/github-and-merge.md +0 -46
  149. package/skills/graphify/skills/vscode/references/hooks.md +0 -33
  150. package/skills/graphify/skills/vscode/references/query.md +0 -249
  151. package/skills/graphify/skills/vscode/references/transcribe.md +0 -48
  152. package/skills/graphify/skills/vscode/references/update.md +0 -179
  153. package/skills/graphify/skills/windows/references/add-watch.md +0 -56
  154. package/skills/graphify/skills/windows/references/exports.md +0 -71
  155. package/skills/graphify/skills/windows/references/extraction-spec.md +0 -68
  156. package/skills/graphify/skills/windows/references/github-and-merge.md +0 -46
  157. package/skills/graphify/skills/windows/references/hooks.md +0 -33
  158. package/skills/graphify/skills/windows/references/query.md +0 -249
  159. package/skills/graphify/skills/windows/references/transcribe.md +0 -48
  160. package/skills/graphify/skills/windows/references/update.md +0 -179
  161. package/skills/graphify/symbol_resolution.py +0 -538
  162. package/skills/graphify/transcribe.py +0 -184
  163. package/skills/graphify/tree_html.py +0 -582
  164. package/skills/graphify/validate.py +0 -72
  165. package/skills/graphify/watch.py +0 -898
  166. package/skills/graphify/wiki.py +0 -282
@@ -1,1379 +0,0 @@
1
- # file discovery, type classification, and corpus health checks
2
- from __future__ import annotations
3
- import fnmatch
4
- import json
5
- import os
6
- import re
7
- import shlex
8
- from enum import Enum
9
- from pathlib import Path
10
-
11
- from graphify.google_workspace import (
12
- GOOGLE_WORKSPACE_EXTENSIONS,
13
- convert_google_workspace_file,
14
- google_workspace_enabled,
15
- )
16
-
17
-
18
class FileType(str, Enum):
    """Coarse content category assigned to a corpus file.

    The ``str`` mix-in makes every member compare equal to its plain string
    value, so members can be used wherever the raw string is expected.
    """

    CODE = "code"
    DOCUMENT = "document"
    PAPER = "paper"
    IMAGE = "image"
    VIDEO = "video"
24
-
25
-
26
- _MANIFEST_PATH = "graphify-out/manifest.json"
27
-
28
- CODE_EXTENSIONS = {'.py', '.ts', '.tsx', '.js', '.jsx', '.mjs', '.ejs', '.ets', '.go', '.rs', '.java', '.groovy', '.gradle', '.cpp', '.cc', '.cxx', '.c', '.h', '.hpp', '.rb', '.swift', '.kt', '.kts', '.cs', '.scala', '.php', '.lua', '.luau', '.toc', '.zig', '.ps1', '.ex', '.exs', '.m', '.mm', '.jl', '.vue', '.svelte', '.astro', '.dart', '.v', '.sv', '.svh', '.sql', '.r', '.f', '.F', '.f90', '.F90', '.f95', '.F95', '.f03', '.F03', '.f08', '.F08', '.pas', '.pp', '.dpr', '.dpk', '.lpr', '.inc', '.dfm', '.lfm', '.lpk', '.sh', '.bash', '.json', '.tf', '.tfvars', '.hcl', '.dm', '.dme', '.dmi', '.dmm', '.dmf', '.sln', '.csproj', '.fsproj', '.vbproj', '.razor', '.cshtml', '.cls', '.trigger'}
29
- DOC_EXTENSIONS = {'.md', '.mdx', '.qmd', '.txt', '.rst', '.html', '.yaml', '.yml'}
30
- PAPER_EXTENSIONS = {'.pdf'}
31
- IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg'}
32
- OFFICE_EXTENSIONS = {'.docx', '.xlsx'}
33
- VIDEO_EXTENSIONS = {'.mp4', '.mov', '.webm', '.mkv', '.avi', '.m4v', '.mp3', '.wav', '.m4a', '.ogg'}
34
-
35
- CORPUS_WARN_THRESHOLD = 50_000 # words - below this, warn "you may not need a graph"
36
- CORPUS_UPPER_THRESHOLD = 500_000 # words - above this, warn about token cost
37
- FILE_COUNT_UPPER = 500 # files - above this, warn about token cost
38
-
39
- # Resource caps for parsing untrusted office/PDF files (F2). A corpus is
40
- # attacker-controllable (graphify runs on cloned/shared folders), and .docx/.xlsx
41
- # are zip+XML containers: a few-KB zip-bomb can decompress to gigabytes and
42
- # OOM-kill the process at load_workbook/Document time. Screen the file before any
43
- # parser touches it.
44
- _OFFICE_MAX_RAW_BYTES = 50 * 1024 * 1024 # 50 MiB on-disk
45
- _OFFICE_MAX_DECOMPRESSED_BYTES = 512 * 1024 * 1024 # 512 MiB total uncompressed
46
- _OFFICE_MAX_COMPRESSION_RATIO = 200 # uncompressed : compressed
47
-
48
-
49
def _file_within_size_cap(path: Path, cap: int = _OFFICE_MAX_RAW_BYTES) -> bool:
    """Report whether *path* can be stat'ed and occupies at most *cap* bytes.

    A path whose ``stat()`` raises ``OSError`` (missing, unreadable, ...) is
    reported as failing the check.
    """
    try:
        size_on_disk = path.stat().st_size
    except OSError:
        return False
    return size_on_disk <= cap
55
-
56
-
57
def _zip_within_caps(path: Path) -> bool:
    """Reject a zip-based office file that is a likely zip/XML bomb.

    Two layers, because the zip central-directory sizes are attacker-controlled:
      1. A cheap pre-filter on the declared sizes (on-disk cap, summed
         uncompressed cap, compression ratio) that rejects an honest bomb
         without decompressing anything.
      2. An authoritative pass that stream-decompresses every member with a
         hard byte ceiling, so a member that under-declares its size in the
         central directory cannot expand past the cap undetected.
         Decompression is chunked and bounded, so checking a bomb never
         materializes more than the ceiling.

    Returns:
        True only when the archive opens, every member decompresses cleanly
        and all caps hold. Any archive that cannot be fully read is rejected
        (fail closed) rather than raising.
    """
    import zipfile
    import zlib

    if not _file_within_size_cap(path):
        return False
    try:
        with zipfile.ZipFile(path) as zf:
            infos = zf.infolist()
            # `or 1` guards the ratio division for empty / zero-size archives.
            compressed = sum(i.compress_size for i in infos) or 1
            declared = sum(i.file_size for i in infos)
            if declared > _OFFICE_MAX_DECOMPRESSED_BYTES:
                return False
            if declared / compressed > _OFFICE_MAX_COMPRESSION_RATIO:
                return False
            total = 0
            for info in infos:
                with zf.open(info) as member:
                    while True:
                        chunk = member.read(1024 * 1024)
                        if not chunk:
                            break
                        total += len(chunk)
                        if total > _OFFICE_MAX_DECOMPRESSED_BYTES:
                            return False
    except (
        zipfile.BadZipFile,
        zlib.error,      # corrupt deflate stream inside a member
        RuntimeError,    # encrypted member opened without a password;
                         # also covers NotImplementedError (unsupported
                         # compression method), which subclasses it
        OSError,
        EOFError,
    ):
        # This is a screen for untrusted input and callers invoke it outside
        # their own try blocks, so an unreadable archive must be reported as
        # "not safe" instead of propagating.
        return False
    return True
93
-
94
- # Parent directories whose contents are always sensitive.
95
- # Checked against path.parts[:-1] (parents only) so a root-level file named
96
- # "credentials" or "secrets" is not falsely flagged by this stage.
97
- _SENSITIVE_DIRS = frozenset({
98
- ".ssh", ".gnupg", ".aws", ".gcloud", "secrets", ".secrets", "credentials",
99
- })
100
-
101
- # Files that may contain secrets - skip silently. These patterns are specific
102
- # (extensions, exact credential-store names) and always apply.
103
- _SENSITIVE_PATTERNS = [
104
- re.compile(r'(^|[\\/])\.(env|envrc)(\.|$)', re.IGNORECASE),
105
- re.compile(r'\.(pem|key|p12|pfx|cert|crt|der|p8)$', re.IGNORECASE),
106
- re.compile(r'(id_rsa|id_dsa|id_ecdsa|id_ed25519)(\.pub)?$'),
107
- re.compile(r'(\.netrc|\.pgpass|\.htpasswd)$', re.IGNORECASE),
108
- re.compile(r'(aws_credentials|gcloud_credentials|service.account)', re.IGNORECASE),
109
- ]
110
-
111
- # Generic keyword patterns - these only count when the keyword is LOAD-BEARING
112
- # in the filename (see _generic_keyword_hit), because a keyword buried mid-phrase
113
- # in a long descriptive slug names a topic, not a credential store:
114
- # "token-economics-of-recall.md" is a note ABOUT tokens; "api_token.txt" IS one.
115
- # Uses lookarounds instead of \b so underscore-prefixed names like api_token.txt
116
- # match. Both patterns use (?![a-zA-Z]) so that the trailing-underscore behavior
117
- # is consistent: "secret_store.txt" IS flagged, "tokenizer.py" is NOT (because
118
- # "i" after "token" is alpha and blocks the match).
119
- # `token` is kept separate because its longer suffix "izer"/"ize" is the only
120
- # common false-positive; other keywords have no such well-known derivatives.
121
- _GENERIC_KEYWORD_PATTERNS = [
122
- re.compile(r'(?<![a-zA-Z0-9])(credential|secret|passwd|password|private_key)s?(?![a-zA-Z])', re.IGNORECASE),
123
- re.compile(r'(?<![a-zA-Z0-9])tokens?(?![a-zA-Z])', re.IGNORECASE),
124
- ]
125
-
126
- # Word separators for the load-bearing check (underscore intentionally included;
127
- # multi-word keywords like private_key are handled by the end-of-stem check,
128
- # which runs before word counting).
129
- _WORD_SPLIT = re.compile(r'[-_\s]+')
130
-
131
-
132
def _generic_keyword_hit(name: str) -> bool:
    """Decide whether a generic secret keyword is load-bearing in *name*.

    Secret-store files name their contents, and in English compounds the
    content noun is the head, which comes last: "github-personal-access-token",
    "api_token", "oauth_token". A keyword counts when it either ends the stem
    or sits in a short (<=2 word) name. Anywhere else it is a topic word in a
    descriptive slug ("token-economics-of-recall.md",
    "password-policy-discussion.md") and must not cause the file to be
    silently dropped from the graph (#436, #718).
    """
    # The stem is the name up to its first dot. Leading dots are stripped
    # first so dotfiles such as ".token" keep their keyword instead of
    # producing an empty stem that can never match.
    stem = name.lstrip(".").partition(".")[0]
    stem_length = len(stem)
    for pattern in _GENERIC_KEYWORD_PATTERNS:
        match_ends = [found.end() for found in pattern.finditer(stem)]
        if not match_ends:
            continue
        if stem_length in match_ends:
            # keyword ends the stem -> it names the contents
            return True
        word_count = sum(1 for word in _WORD_SPLIT.split(stem) if word)
        if word_count <= 2:
            # short name like token_config.yaml / secret_handler.txt
            return True
    return False
154
-
155
- # Signals that a .md/.txt file is actually a converted academic paper
156
- _PAPER_SIGNALS = [
157
- re.compile(r'\barxiv\b', re.IGNORECASE),
158
- re.compile(r'\bdoi\s*:', re.IGNORECASE),
159
- re.compile(r'\babstract\b', re.IGNORECASE),
160
- re.compile(r'\bproceedings\b', re.IGNORECASE),
161
- re.compile(r'\bjournal\b', re.IGNORECASE),
162
- re.compile(r'\bpreprint\b', re.IGNORECASE),
163
- re.compile(r'\\cite\{'), # LaTeX citation
164
- re.compile(r'\[\d+\]'), # Numbered citation [1], [23] (inline)
165
- re.compile(r'\[\n\d+\n\]'), # Numbered citation spread across lines (markdown conversion)
166
- re.compile(r'eq\.\s*\d+|equation\s+\d+', re.IGNORECASE),
167
- re.compile(r'\d{4}\.\d{4,5}'), # arXiv ID like 1706.03762
168
- re.compile(r'\bwe propose\b', re.IGNORECASE), # common academic phrasing
169
- re.compile(r'\bliterature\b', re.IGNORECASE), # "from the literature"
170
- ]
171
- _PAPER_SIGNAL_THRESHOLD = 3 # need at least this many signals to call it a paper
172
-
173
-
174
def _is_sensitive(path: Path) -> bool:
    """Tell whether *path* likely holds secrets and should therefore be skipped."""
    # Stage 1: a PARENT directory is a known secrets directory. The final
    # component (the filename) is left out on purpose so that a root-level
    # file named "credentials" is not skipped here - the name checks below
    # deal with the filename.
    parent_components = path.parts[:-1]
    if not _SENSITIVE_DIRS.isdisjoint(parent_components):
        return True

    filename = path.name
    # Stage 2: the filename matches one of the specific patterns.
    for pattern in _SENSITIVE_PATTERNS:
        if pattern.search(filename):
            return True

    # Stage 3: generic keywords, but only when load-bearing in the name.
    return _generic_keyword_hit(filename)
187
-
188
-
189
def _looks_like_paper(path: Path) -> bool:
    """Heuristic: does this text file read like an academic paper?

    Only the first 3000 characters are inspected. The file is opened and a
    bounded prefix is read, so a very large text file is never loaded into
    memory just to be sliced.

    Returns:
        True when at least ``_PAPER_SIGNAL_THRESHOLD`` distinct patterns from
        ``_PAPER_SIGNALS`` match the prefix; False otherwise, including when
        the file cannot be opened or read.
    """
    try:
        # Undecodable bytes are dropped (errors="ignore"), matching a lenient
        # best-effort scan rather than failing on odd encodings.
        with path.open(encoding="utf-8", errors="ignore") as handle:
            text = handle.read(3000)
    except (OSError, ValueError):
        return False
    hits = sum(1 for pattern in _PAPER_SIGNALS if pattern.search(text))
    return hits >= _PAPER_SIGNAL_THRESHOLD
198
-
199
-
200
- _ASSET_DIR_MARKERS = {".imageset", ".xcassets", ".appiconset", ".colorset", ".launchimage"}
201
-
202
-
203
- _SHEBANG_CODE_INTERPRETERS = {
204
- "python", "python3", "python2",
205
- "ruby", "perl", "node", "nodejs",
206
- "bash", "sh", "dash", "zsh", "fish", "ksh", "tcsh",
207
- "lua", "php", "julia", "Rscript",
208
- }
209
-
210
-
211
- def _split_env_s(value: str, rest: list[str]) -> list[str]:
212
- """Re-tokenize an `env -S`/`--split-string` packed command, prepending the
213
- operand to any trailing args. Returns the unpacked argv."""
214
- packed = " ".join([value, *rest]).strip()
215
- return shlex.split(packed)
216
-
217
-
218
- def _env_command_args(args: list[str], *, allow_split: bool = True) -> list[str]:
219
- """Strip leading env(1) options and var assignments, return the trailing
220
- command argv. Covers macOS/BSD and GNU coreutils env documented spellings.
221
-
222
- POSIX/macOS short forms:
223
- env [-0iv] [-C workdir] [-P utilpath] [-S string]
224
- [-u name] [name=value ...] [utility [argument ...]]
225
-
226
- GNU coreutils long/compact forms additionally supported:
227
- --argv0=ARG / -a ARG / -aARG
228
- --unset=NAME / --unset NAME / -u NAME / -uNAME
229
- --chdir=DIR / --chdir DIR / -C DIR / -CDIR
230
- --split-string=STRING / --split-string STRING
231
- -S STRING / -SSTRING / -vS STRING / -vSSTRING
232
- --ignore-environment / --null / --debug / --list-signal-handling
233
- --default-signal[=SIG] / --ignore-signal[=SIG] / --block-signal[=SIG]
234
-
235
- `-S` / `--split-string` payloads are themselves env-style argument lists
236
- per the GNU shebang synopsis:
237
- #!/usr/bin/env -[v]S[option]... [name=value]... command [args]...
238
- so after splitting the payload we recursively re-parse it with
239
- `allow_split=False` (a nested -S inside a split payload is rejected to
240
- bound recursion).
241
-
242
- Unknown hyphen-prefixed args yield [] (we refuse to guess whether
243
- their next token is an interpreter or an operand).
244
- """
245
- i = 0
246
- while i < len(args):
247
- arg = args[i]
248
-
249
- if arg == "--":
250
- return args[i + 1:]
251
-
252
- # Split-string forms: tokenize the packed payload, then re-parse it
253
- # as env args (so leading assignments/flags inside the payload are
254
- # skipped before the interpreter is identified).
255
- if allow_split:
256
- if arg == "-S":
257
- if i + 1 >= len(args):
258
- return []
259
- return _env_command_args(
260
- _split_env_s(" ".join(args[i + 1:]), []),
261
- allow_split=False,
262
- )
263
- if arg.startswith("-S") and len(arg) > 2:
264
- return _env_command_args(
265
- _split_env_s(arg[2:], args[i + 1:]),
266
- allow_split=False,
267
- )
268
- if arg == "-vS":
269
- if i + 1 >= len(args):
270
- return []
271
- return _env_command_args(
272
- _split_env_s(" ".join(args[i + 1:]), []),
273
- allow_split=False,
274
- )
275
- if arg.startswith("-vS") and len(arg) > 3:
276
- return _env_command_args(
277
- _split_env_s(arg[3:], args[i + 1:]),
278
- allow_split=False,
279
- )
280
- if arg.startswith("--split-string="):
281
- return _env_command_args(
282
- _split_env_s(arg.split("=", 1)[1], args[i + 1:]),
283
- allow_split=False,
284
- )
285
- if arg == "--split-string":
286
- if i + 1 >= len(args):
287
- return []
288
- return _env_command_args(
289
- _split_env_s(args[i + 1], args[i + 2:]),
290
- allow_split=False,
291
- )
292
-
293
- # Options with separate required operand
294
- if arg in {"-u", "-C", "-P", "-a", "--unset", "--chdir", "--argv0"}:
295
- if i + 2 > len(args):
296
- return []
297
- i += 2
298
- continue
299
-
300
- # Clumped short option + operand
301
- if (
302
- arg.startswith(("-u", "-C", "-P", "-a"))
303
- and len(arg) > 2
304
- and not arg.startswith("--")
305
- ):
306
- i += 1
307
- continue
308
-
309
- # Long option with `=` operand
310
- if arg.startswith(("--unset=", "--chdir=", "--argv0=")):
311
- i += 1
312
- continue
313
-
314
- # No-operand flags
315
- if arg in {"-", "-i", "-0", "-v", "--ignore-environment", "--null",
316
- "--debug", "--list-signal-handling"}:
317
- i += 1
318
- continue
319
-
320
- # Signal-handling long flags (with or without =SIG operand — we treat
321
- # them as no-effect for interpreter-resolution purposes)
322
- if arg.startswith(("--default-signal", "--ignore-signal", "--block-signal")):
323
- i += 1
324
- continue
325
-
326
- # Unknown hyphen-prefixed: refuse to guess
327
- if arg.startswith("-"):
328
- return []
329
-
330
- # Inline NAME=value assignment
331
- if "=" in arg:
332
- i += 1
333
- continue
334
-
335
- # First non-option, non-assignment token starts the command argv
336
- return args[i:]
337
-
338
- return []
339
-
340
-
341
- def _shebang_interpreter(path: Path) -> str | None:
342
- """Return the interpreter name from a shebang line.
343
-
344
- Handles forms that a naive parser misses:
345
- - `#!/usr/bin/env -S python3 -u` (env -S split-args form, anywhere)
346
- - `#!/usr/bin/env -i bash` (no-operand env flags)
347
- - `#!/usr/bin/env -u VAR python3` (env options with operands)
348
- - `#!/usr/bin/env -C /tmp python3` (env -C workdir)
349
- - `#!/usr/bin/env -P /bin python3` (env -P utilpath)
350
- - `#!/usr/bin/env DEBUG=1 python3` (inline var assignment)
351
- - `#!"/usr/local/bin/python with spaces"` (shlex handles quotes)
352
-
353
- Returns the basename of the resolved interpreter, or None if there is
354
- no shebang / the file is unreadable / parsing fails.
355
- """
356
- try:
357
- with path.open("rb") as f:
358
- first = f.read(256)
359
- if not first.startswith(b"#!"):
360
- return None
361
- line = first.split(b"\n")[0].decode(errors="replace")[2:].strip()
362
- parts = shlex.split(line)
363
- if not parts:
364
- return None
365
- interp = Path(parts[0]).name
366
- if interp == "env":
367
- env_args = _env_command_args(parts[1:])
368
- if not env_args:
369
- return None
370
- interp = Path(env_args[0]).name
371
- return interp
372
- except (OSError, ValueError):
373
- return None
374
-
375
-
376
def _shebang_file_type(path: Path) -> FileType | None:
    """Classify an extensionless file by peeking at its shebang line.

    Returns:
        ``FileType.CODE`` when the shebang resolves to an interpreter listed
        in ``_SHEBANG_CODE_INTERPRETERS``, either directly or after a trailing
        version suffix is removed (``python3.11`` -> ``python``,
        ``lua5.4`` -> ``lua``). ``None`` otherwise.
    """
    interp = _shebang_interpreter(path)
    if interp is None:
        return None
    if interp in _SHEBANG_CODE_INTERPRETERS:
        return FileType.CODE
    # Versioned binaries are commonly installed next to the plain name; retry
    # with the trailing digits/dots stripped so they are not missed.
    base = interp.rstrip("0123456789.")
    if base and base in _SHEBANG_CODE_INTERPRETERS:
        return FileType.CODE
    return None
382
-
383
-
384
def classify_file(path: Path) -> FileType | None:
    """Map *path* to a ``FileType`` based on its name, or None if unsupported.

    The suffix is compared case-insensitively. Files without a suffix are
    classified from their shebang line instead.
    """
    # Compound extensions must be checked before the simple suffix lookup.
    if path.name.lower().endswith(".blade.php"):
        return FileType.CODE

    suffix = path.suffix.lower()
    if not suffix:
        return _shebang_file_type(path)

    if suffix in CODE_EXTENSIONS:
        return FileType.CODE

    if suffix in PAPER_EXTENSIONS:
        # PDFs inside Xcode asset catalogs are vector icons, not papers.
        markers = tuple(_ASSET_DIR_MARKERS)
        inside_asset_catalog = any(part.endswith(markers) for part in path.parts)
        return None if inside_asset_catalog else FileType.PAPER

    if suffix in IMAGE_EXTENSIONS:
        return FileType.IMAGE

    if suffix in DOC_EXTENSIONS:
        # A text document may actually be a converted paper.
        return FileType.PAPER if _looks_like_paper(path) else FileType.DOCUMENT

    if suffix in OFFICE_EXTENSIONS or suffix in GOOGLE_WORKSPACE_EXTENSIONS:
        return FileType.DOCUMENT

    if suffix in VIDEO_EXTENSIONS:
        return FileType.VIDEO

    return None
412
-
413
-
414
def extract_pdf_text(path: Path) -> str:
    """Return the plain text of a PDF, extracted page by page with pypdf.

    Pages that yield no text are skipped; the remaining page texts are joined
    with newlines. An empty string is returned when the file fails the size
    check, or when importing pypdf or reading the PDF raises.
    """
    if not _file_within_size_cap(path):
        return ""
    try:
        from pypdf import PdfReader

        reader = PdfReader(str(path))
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)
    except Exception:
        # Best effort: an unparseable PDF contributes no text.
        return ""
429
-
430
-
431
def docx_to_markdown(path: Path) -> str:
    """Convert a .docx file to markdown text using python-docx.

    Paragraphs are emitted first, then every table. Paragraph styles starting
    with "Heading 1"/"Heading 2"/"Heading 3" become `#`/`##`/`###` headings,
    styles starting with "List" become `-` bullets, and anything else is
    plain text. Each table is rendered as a pipe table whose first row is the
    header.

    Returns:
        The markdown text, or "" when the file fails the zip-bomb screen,
        python-docx is not installed, or the document cannot be parsed.
    """
    if not _zip_within_caps(path):
        return ""
    # (style-name prefix, markdown prefix); the first matching entry wins.
    style_prefixes = (
        ("Heading 1", "# "),
        ("Heading 2", "## "),
        ("Heading 3", "### "),
        ("List", "- "),
    )
    try:
        from docx import Document

        doc = Document(str(path))
        lines: list[str] = []
        for para in doc.paragraphs:
            # Fall back to "" when the style or its name is missing, so one
            # oddly-styled paragraph cannot abort the whole conversion.
            style = (para.style.name if para.style else "") or ""
            text = para.text.strip()
            if not text:
                lines.append("")
                continue
            prefix = next(
                (md for name, md in style_prefixes if style.startswith(name)),
                "",
            )
            lines.append(prefix + text)
        # Tables
        for table in doc.tables:
            rows = [[cell.text.strip() for cell in row.cells] for row in table.rows]
            if not rows:
                continue
            header = "| " + " | ".join(rows[0]) + " |"
            sep = "| " + " | ".join("---" for _ in rows[0]) + " |"
            lines.extend([header, sep])
            for row in rows[1:]:
                lines.append("| " + " | ".join(row) + " |")
        return "\n".join(lines)
    except Exception:
        # Best effort: also covers ImportError when python-docx is absent.
        return ""
471
-
472
-
473
def xlsx_to_markdown(path: Path) -> str:
    """Convert an .xlsx file to markdown text using openpyxl.

    Every sheet with at least one non-empty row becomes a `## Sheet: <name>`
    section followed by a pipe table; the first non-empty row is used as the
    header. Rows whose cells are all None are skipped and None cells render
    as empty strings.

    Returns:
        The markdown text, or "" when the file fails the zip-bomb screen,
        openpyxl is not installed, or the workbook cannot be read.
    """
    if not _zip_within_caps(path):
        return ""
    try:
        import openpyxl

        wb = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
    except Exception:
        # Best effort: also covers ImportError when openpyxl is absent.
        return ""
    try:
        sections = []
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            rows = []
            for row in ws.iter_rows(values_only=True):
                if all(cell is None for cell in row):
                    continue
                rows.append([str(cell) if cell is not None else "" for cell in row])
            if not rows:
                continue
            sections.append(f"## Sheet: {sheet_name}")
            header = "| " + " | ".join(rows[0]) + " |"
            sep = "| " + " | ".join("---" for _ in rows[0]) + " |"
            sections.extend([header, sep])
            for row in rows[1:]:
                sections.append("| " + " | ".join(row) + " |")
        return "\n".join(sections)
    except Exception:
        return ""
    finally:
        # A read-only workbook keeps its source file open until closed, so
        # release it on the failure path as well as on success.
        try:
            wb.close()
        except Exception:
            pass
503
-
504
-
505
def xlsx_extract_structure(path: Path) -> dict:
    """Extract structural nodes (sheets, named tables, column headers) from an .xlsx file.

    Returns a nodes/edges dict compatible with the graphify extract pipeline.
    Used in addition to xlsx_to_markdown so Claude sees both structure and content.

    Args:
        path: Location of the workbook on disk.

    Returns:
        ``{"nodes": [...], "edges": [...]}``. Both lists are empty when
        openpyxl is not installed or the workbook cannot be loaded.
    """
    def _nid(*parts: str) -> str:
        return re.sub(r"[^a-z0-9_]", "_", "_".join(p.lower() for p in parts).strip("_"))

    try:
        import openpyxl
    except ImportError:
        return {"nodes": [], "edges": []}

    # Imported once instead of once per table. A failure here only disables
    # column-header extraction, matching the old per-table best-effort import.
    try:
        from openpyxl.utils import range_boundaries
    except ImportError:
        range_boundaries = None

    try:
        wb = openpyxl.load_workbook(str(path), read_only=False, data_only=True)
    except Exception:
        return {"nodes": [], "edges": []}

    # SECURITY NOTE: before wiring this path into the dispatcher, re-audit it
    # for zip/XML bombs (openpyxl is built on top of zipfile and XML parsing —
    # a malicious .xlsx can blow up memory at load_workbook time).
    stem = re.sub(r"[^a-z0-9]", "_", path.stem.lower())
    str_path = str(path)
    file_nid = _nid(str_path)
    nodes: list[dict] = [{"id": file_nid, "label": path.name, "file_type": "document",
                          "source_file": str_path, "source_location": None}]
    edges: list[dict] = []
    seen: set[str] = {file_nid}

    def _add(nid: str, label: str) -> None:
        # De-duplicate by node id so repeated headers produce a single node.
        if nid not in seen:
            seen.add(nid)
            nodes.append({"id": nid, "label": label, "file_type": "document",
                          "source_file": str_path, "source_location": None})

    def _edge(src: str, tgt: str, relation: str) -> None:
        edges.append({"source": src, "target": tgt, "relation": relation,
                      "confidence": "EXTRACTED", "source_file": str_path,
                      "source_location": None, "weight": 1.0})

    try:
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            sheet_nid = _nid(stem, sheet_name)
            _add(sheet_nid, f"{sheet_name} (sheet)")
            _edge(file_nid, sheet_nid, "contains")

            # Named Excel Tables (ListObjects)
            if hasattr(ws, "tables"):
                for tbl in ws.tables.values():
                    tbl_nid = _nid(stem, sheet_name, tbl.name)
                    _add(tbl_nid, tbl.name)
                    _edge(sheet_nid, tbl_nid, "contains")
                    # Column headers from table header row
                    ref = tbl.ref  # e.g. "A1:D10"
                    if not ref or range_boundaries is None:
                        continue
                    try:
                        min_col, min_row, max_col, _ = range_boundaries(ref)
                        header_row = list(ws.iter_rows(min_row=min_row, max_row=min_row,
                                                       min_col=min_col, max_col=max_col,
                                                       values_only=True))
                        if header_row:
                            for col_name in header_row[0]:
                                if col_name:
                                    col_nid = _nid(stem, tbl.name, str(col_name))
                                    _add(col_nid, str(col_name))
                                    _edge(tbl_nid, col_nid, "contains")
                    except Exception:
                        # Header extraction is best-effort; the table node
                        # itself has already been recorded.
                        pass
            else:
                # Fallback: first row as column headers
                for row in ws.iter_rows(max_row=1, values_only=True):
                    for cell in row:
                        if cell:
                            col_nid = _nid(stem, sheet_name, str(cell))
                            _add(col_nid, str(cell))
                            _edge(sheet_nid, col_nid, "contains")
                    break
    finally:
        # Close the workbook even if a sheet raises; errors still propagate
        # to the caller exactly as before.
        try:
            wb.close()
        except Exception:
            pass

    return {"nodes": nodes, "edges": edges}
593
-
594
-
595
- def convert_office_file(path: Path, out_dir: Path) -> Path | None:
596
- """Convert a .docx or .xlsx to a markdown sidecar in out_dir.
597
-
598
- Returns the path of the converted .md file, or None if conversion failed
599
- or the required library is not installed.
600
- """
601
- ext = path.suffix.lower()
602
- if ext == ".docx":
603
- text = docx_to_markdown(path)
604
- elif ext == ".xlsx":
605
- text = xlsx_to_markdown(path)
606
- else:
607
- return None
608
-
609
- if not text.strip():
610
- return None
611
-
612
- out_dir.mkdir(parents=True, exist_ok=True)
613
- # Use a stable name derived from the original path to avoid collisions
614
- import hashlib
615
- name_hash = hashlib.sha256(str(path.resolve()).encode()).hexdigest()[:8]
616
- out_path = out_dir / f"{path.stem}_{name_hash}.md"
617
- out_path.write_text(
618
- f"<!-- converted from {path.name} -->\n\n{text}",
619
- encoding="utf-8",
620
- )
621
- return out_path
622
-
623
-
624
def count_words(path: Path) -> int:
    """Return the number of whitespace-separated words in ``path``.

    .pdf, .docx and .xlsx files are routed through their text extractors;
    anything else is read as UTF-8 with undecodable bytes ignored. Any
    failure yields 0.
    """
    try:
        suffix = path.suffix.lower()
        if suffix == ".pdf":
            text = extract_pdf_text(path)
        elif suffix == ".docx":
            text = docx_to_markdown(path)
        elif suffix == ".xlsx":
            text = xlsx_to_markdown(path)
        else:
            text = path.read_text(encoding="utf-8", errors="ignore")
        return len(text.split())
    except Exception:
        return 0
636
-
637
-
638
# Directory names to always skip - venvs, caches, build artifacts, deps
_SKIP_DIRS = {
    "venv", ".venv", "env", ".env",
    "node_modules", "__pycache__", ".git",
    "dist", "build", "target", "out",
    "site-packages", "lib64",
    ".pytest_cache", ".mypy_cache", ".ruff_cache",
    ".tox", ".eggs", "*.egg-info",
    "graphify-out",                 # never treat own output as source input (#524)
    # Coverage/test-artefact dirs — generated, never architecturally meaningful
    "coverage", "lcov-report",      # Vitest/Istanbul/nyc HTML reports (#870)
    "visual-tests", "visual-test",  # Playwright/visual-regression bundles (#869)
    "__snapshots__", "snapshots",   # Jest/Vitest snapshot dirs
    "storybook-static",             # Storybook production build output
    "dist-protected",               # Protected dist variants (same noise as dist)
    # Framework cache/build dirs — generated, never architecturally meaningful (#873)
    ".next", ".nuxt", ".turbo", ".angular",
    ".idea", ".cache", ".parcel-cache", ".svelte-kit", ".terraform", ".serverless",
    ".graphify",   # graphify's own extraction cache — never index self-generated data
    ".worktrees",  # git worktree convention (#947) — sibling checkouts, always redundant
}

# Large generated files that are never useful to extract
_SKIP_FILES = {
    "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
    "Cargo.lock", "poetry.lock", "Gemfile.lock",
    "composer.lock", "go.sum", "go.work.sum",
}

# Directory-name suffixes treated as noise: *_venv / *_env environments and
# *.egg-info metadata directories.
_NOISE_DIR_SUFFIXES = ("_venv", "_env", ".egg-info")


def _is_noise_dir(part: str, parent: "Path | None" = None) -> bool:
    """Tell whether a directory name denotes a venv, cache, or dependency dir.

    ``part`` is a single path component. ``parent`` is the directory holding
    it and is only consulted for the ``worktrees`` rule: a ``worktrees``
    directory counts as noise when its parent's name starts with a dot
    (e.g. .claude/worktrees/, .git/worktrees/).
    """
    if part in _SKIP_DIRS or part.endswith(_NOISE_DIR_SUFFIXES):
        return True
    return part == "worktrees" and parent is not None and parent.name.startswith(".")
680
-
681
-
682
# Entries whose presence in a directory marks it as a version-control root.
_VCS_MARKERS = (".git", ".hg", ".svn", "_darcs", ".fossil")


def _parse_gitignore_line(raw: str) -> str:
    """Parse one raw line from a .graphifyignore file per gitignore spec.

    - Strip newline chars
    - Strip leading whitespace
    - Return empty string for blank lines and full-line comments
    - Strip inline comments (whitespace + # suffix), but only when # is
      preceded by whitespace — so path#with#hash.py is preserved. A bare
      trailing ``#`` with nothing after it is also treated as a comment.
    - Unescape \\# to literal #
    - Remove trailing spaces unless escaped with backslash

    Args:
        raw: One line of an ignore file, with or without its line terminator.

    Returns:
        The cleaned pattern, or ``""`` when the line carries no pattern.
    """
    line = raw.rstrip("\n\r").lstrip()
    if not line or line.startswith("#"):
        return ""
    # Strip inline comments: require whitespace before # (gitignore extension).
    # The comment text is optional so that "dist #" is reduced to "dist";
    # a backslash directly after the hashes still prevents stripping.
    line = re.sub(r"\s+#+(?:[^\\].*)?$", "", line)
    # Unescape \# → literal #
    line = line.replace("\\#", "#")
    # Remove unescaped trailing spaces (per gitignore spec)
    line = re.sub(r"(?<!\\) +$", "", line)
    return line
707
-
708
-
709
def _find_vcs_root(start: Path) -> Path | None:
    """Return the nearest ancestor of ``start`` (inclusive) holding a VCS marker.

    The climb stops after the user's home directory or the filesystem root
    has been checked; None is returned when no marker was found by then.
    """
    origin = start.resolve()
    home = Path.home()
    for directory in (origin, *origin.parents):
        if any((directory / marker).exists() for marker in _VCS_MARKERS):
            return directory
        if directory == home:
            # Never look above the home directory.
            return None
    return None
720
-
721
-
722
def _load_graphifyignore(root: Path) -> list[tuple[Path, str]]:
    """Read .graphifyignore files and return (anchor_dir, pattern) pairs.

    Patterns are returned outer-first so that inner (closer) rules are
    appended last and win via last-match-wins semantics — matching gitignore
    behavior exactly.

    Walk ceiling: the nearest VCS root if inside a repo, otherwise the scan
    root itself (hermetic — no leakage across unrelated sibling projects).

    An ignore file that is not a regular file, or that cannot be read, is
    skipped instead of aborting the scan.
    """
    root = root.resolve()
    ceiling = _find_vcs_root(root) or root

    # Collect ancestor dirs from scan root up to the ceiling, then flip the
    # list so it runs outer → inner.
    dirs: list[Path] = []
    current = root
    while True:
        dirs.append(current)
        # The second test is a safety net: stop at the filesystem root even
        # if the ceiling was somehow not on the ancestor chain.
        if current == ceiling or current.parent == current:
            break
        current = current.parent
    dirs.reverse()  # ceiling first, scan root last

    patterns: list[tuple[Path, str]] = []
    for d in dirs:
        # Prefer .graphifyignore; fall back to .gitignore so projects that already
        # maintain a .gitignore get sensible defaults without duplicating it (#945).
        ignore_file = d / ".graphifyignore"
        if not ignore_file.is_file():
            ignore_file = d / ".gitignore"
        if not ignore_file.is_file():
            continue
        try:
            text = ignore_file.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable ignore file (e.g. permissions): treat as absent.
            continue
        for raw in text.splitlines():
            line = _parse_gitignore_line(raw)
            if line:
                patterns.append((d, line))
    return patterns
758
-
759
-
760
def _is_ignored(path: Path, root: Path, patterns: list[tuple[Path, str]]) -> bool:
    """Decide whether ``path`` is excluded by the given ignore patterns.

    Every pattern is evaluated in order and the last one that matches decides
    the outcome (gitignore last-match-wins); a pattern starting with ``!``
    flips a match into "not ignored".

    The gitignore parent-exclusion rule is enforced as well: when any ancestor
    directory of ``path`` below ``root`` is ignored, ``path`` is ignored no
    matter what later ``!`` patterns say.
    """
    if not patterns:
        return False

    def _relative(target: Path, base: Path):
        # Forward-slash path of target below base, or None when not below it.
        try:
            return str(target.relative_to(base)).replace(os.sep, "/")
        except ValueError:
            return None

    def _loose_match(target: Path, rel: str, glob: str) -> bool:
        # Unanchored patterns may hit the whole relative path, the final name,
        # any single component, or any leading sub-path.
        segments = rel.split("/")
        candidates = [rel, target.name]
        for idx, segment in enumerate(segments, start=1):
            candidates.append(segment)
            candidates.append("/".join(segments[:idx]))
        return any(fnmatch.fnmatch(candidate, glob) for candidate in candidates)

    def _eval(target: Path) -> bool:
        """Apply last-match-wins to a single target path."""
        verdict = False
        for anchor, pattern in patterns:
            negated = pattern.startswith("!")
            body = pattern[1:] if negated else pattern
            glob = body.strip("/")
            if not glob:
                continue

            if body.startswith("/"):
                # Anchored: only the path relative to the pattern's own
                # directory is considered, and it must match as a whole.
                rel = _relative(target, anchor)
                hit = rel is not None and fnmatch.fnmatch(rel, glob)
            else:
                # Unanchored: try relative to the scan root first, then to
                # the directory that declared the pattern.
                bases = (root,) if anchor == root else (root, anchor)
                hit = False
                for base in bases:
                    rel = _relative(target, base)
                    if rel is not None and _loose_match(target, rel, glob):
                        hit = True
                        break

            if hit:
                verdict = not negated  # last match wins; ! flips to un-ignore
        return verdict

    try:
        rel_parts = path.relative_to(root).parts
    except ValueError:
        # Outside the scan root there are no ancestors to test.
        return _eval(path)

    # Walk ancestors top-down; the first excluded ancestor settles the answer.
    ancestor = root
    for part in rel_parts[:-1]:
        ancestor = ancestor / part
        if _eval(ancestor):
            return True
    return _eval(path)
838
-
839
-
840
def _load_graphifyinclude(root: Path) -> list[tuple[Path, str]]:
    """Read .graphifyinclude allowlist patterns from root and ancestors.

    Include patterns opt matching hidden files/dirs into traversal. Sensitive
    files and hard-skipped noise directories are still excluded later.
    Uses the same VCS-root ceiling logic as _load_graphifyignore.

    Returns (anchor_dir, pattern) pairs ordered outer directory first. An
    include file that is not a regular file, or that cannot be read, is
    skipped instead of aborting the scan.
    """
    root = root.resolve()
    ceiling = _find_vcs_root(root) or root

    dirs: list[Path] = []
    current = root
    while True:
        dirs.append(current)
        # The second test is a safety net: stop at the filesystem root even
        # if the ceiling was somehow not on the ancestor chain.
        if current == ceiling or current.parent == current:
            break
        current = current.parent
    dirs.reverse()  # ceiling first, scan root last

    patterns: list[tuple[Path, str]] = []
    for d in dirs:
        include_file = d / ".graphifyinclude"
        if not include_file.is_file():
            continue
        try:
            text = include_file.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            # Unreadable include file (e.g. permissions): treat as absent.
            continue
        for raw in text.splitlines():
            line = _parse_gitignore_line(raw)
            if line:
                patterns.append((d, line))
    return patterns
868
-
869
-
870
def _is_included(path: Path, root: Path, patterns: list[tuple[Path, str]]) -> bool:
    """Report whether ``path`` matches at least one .graphifyinclude pattern.

    A pattern starting with ``/`` is anchored: it must match the whole path
    relative to the directory that declared it. Any other pattern may match
    the relative path, the file name, a single component, or a leading
    sub-path, relative to the scan root or to the declaring directory.
    """
    if not patterns:
        return False

    def _relative(base: Path):
        # Forward-slash path of ``path`` below base, or None when not below it.
        try:
            return str(path.relative_to(base)).replace(os.sep, "/")
        except ValueError:
            return None

    def _loose_match(rel: str, glob: str) -> bool:
        segments = rel.split("/")
        candidates = [rel, path.name]
        for idx, segment in enumerate(segments, start=1):
            candidates.append(segment)
            candidates.append("/".join(segments[:idx]))
        return any(fnmatch.fnmatch(candidate, glob) for candidate in candidates)

    for anchor, pattern in patterns:
        glob = pattern.strip("/")
        if not glob:
            continue

        if pattern.startswith("/"):
            rel = _relative(anchor)
            if rel is not None and fnmatch.fnmatch(rel, glob):
                return True
            continue

        # Unanchored: scan root first, then the declaring directory.
        bases = (root,) if anchor == root else (root, anchor)
        for base in bases:
            rel = _relative(base)
            if rel is not None and _loose_match(rel, glob):
                return True
    return False
917
-
918
-
919
def _could_contain_included_path(path: Path, root: Path, patterns: list[tuple[Path, str]]) -> bool:
    """Report whether directory ``path`` might hold files matched by .graphifyinclude.

    The directory qualifies when, relative to the scan root or to any pattern's
    declaring directory, it equals a pattern, is a leading sub-path of one, or
    matches one as a glob.
    """
    if not patterns:
        return False

    # Scan root first, then every declaring directory that differs from it.
    bases = [root] + [anchor for anchor, _ in patterns if anchor != root]
    rels: list[str] = []
    for base in bases:
        try:
            rels.append(str(path.relative_to(base)).replace(os.sep, "/"))
        except ValueError:
            continue

    globs = [g for g in (pattern.strip("/") for _, pattern in patterns) if g]
    for rel in rels:
        rel = rel.strip("/")
        if not rel:
            return True
        prefix = rel + "/"
        for glob in globs:
            if glob == rel or glob.startswith(prefix) or fnmatch.fnmatch(rel, glob):
                return True
    return False
949
-
950
-
951
def _auto_follow_symlinks(root: Path) -> bool:
    """Auto-detect: ``True`` if ``root`` has any direct symlinked child.

    This lets a "fake working dir" (a folder full of symlinks pointing at
    source dirs scattered across the machine) work without the caller having
    to pass ``follow_symlinks=True`` explicitly.

    Callers can always override the result by passing an explicit
    ``follow_symlinks=True`` or ``follow_symlinks=False`` to :func:`detect` /
    :func:`detect_incremental`.

    A directory that cannot be listed counts as having no symlinked children.
    """
    try:
        return any(child.is_symlink() for child in root.iterdir())
    except OSError:  # PermissionError is a subclass of OSError
        return False
968
-
969
-
970
def detect(root: Path, *, follow_symlinks: bool | None = None, google_workspace: bool | None = None, extra_excludes: list[str] | None = None) -> dict:
    """Scan ``root`` and bucket every eligible file by :class:`FileType`.

    Args:
        root: Directory to scan; resolved before use.
        follow_symlinks: Whether ``os.walk`` follows symlinked directories.
            ``None`` auto-detects via :func:`_auto_follow_symlinks`.
        google_workspace: Whether Google Workspace shortcut files are
            exported. ``None`` defers to ``google_workspace_enabled()``.
        extra_excludes: Extra ignore patterns (CLI ``--exclude``), anchored at
            the scan root and appended after the file-based patterns.

    Returns:
        A dict with keys ``files`` (file-type value → sorted path list),
        ``total_files``, ``total_words``, ``needs_graph``, ``warning``,
        ``skipped_sensitive``, ``graphifyignore_patterns`` and ``scan_root``.
    """
    root = root.resolve()
    if follow_symlinks is None:
        follow_symlinks = _auto_follow_symlinks(root)
    google_workspace = google_workspace_enabled() if google_workspace is None else google_workspace
    files: dict[FileType, list[str]] = {
        FileType.CODE: [],
        FileType.DOCUMENT: [],
        FileType.PAPER: [],
        FileType.IMAGE: [],
        FileType.VIDEO: [],
    }
    total_words = 0

    skipped_sensitive: list[str] = []
    ignore_patterns = _load_graphifyignore(root)
    # CLI --exclude patterns are anchored at the scan root and appended last
    # so they win over any .graphifyignore/.gitignore rules (#947).
    for pat in extra_excludes or ():
        line = _parse_gitignore_line(pat)
        if line:
            ignore_patterns.append((root, line))
    # When negation patterns (!) exist, directory-level ignore pruning is
    # skipped so negated files inside can still be reached. The pattern list
    # is final here, so this is computed once rather than per directory.
    has_negation = any(pattern.startswith("!") for _, pattern in ignore_patterns)

    # Always include graphify-out/memory/ - query results filed back into the graph
    memory_dir = root / "graphify-out" / "memory"
    memory_exists = memory_dir.exists()
    scan_paths = [root]
    if memory_exists:
        scan_paths.append(memory_dir)

    seen: set[Path] = set()
    all_files: list[Path] = []

    for scan_root in scan_paths:
        # Path containment (not a string prefix) so a sibling such as
        # graphify-out/memory-old is never mistaken for the memory tree.
        in_memory_tree = memory_exists and scan_root.is_relative_to(memory_dir)
        for dirpath, dirnames, filenames in os.walk(scan_root, followlinks=follow_symlinks):
            dp = Path(dirpath)
            if follow_symlinks and os.path.islink(dirpath):
                # Symlink cycle guard: do not descend into a link that
                # resolves to its own parent or to one of its ancestors.
                real = os.path.realpath(dirpath)
                parent_real = os.path.realpath(os.path.dirname(dirpath))
                if parent_real == real or parent_real.startswith(real + os.sep):
                    dirnames.clear()
                    continue
            if not in_memory_tree:
                # Prune noise dirs in-place so os.walk never descends into them.
                # Dot dirs are allowed — users often want .github/, .claude/, etc.
                # Framework caches (.next, .nuxt, …) are caught by _is_noise_dir.
                dirnames[:] = [
                    d for d in dirnames
                    if not _is_noise_dir(d, dp)
                    and (has_negation or not _is_ignored(dp / d, root, ignore_patterns))
                ]
            for fname in filenames:
                if fname in _SKIP_FILES:
                    continue
                p = dp / fname
                if p not in seen:
                    seen.add(p)
                    all_files.append(p)

    all_files.sort(key=lambda p: str(p))

    converted_dir = root / "graphify-out" / "converted"

    for p in all_files:
        # Memory dir files bypass the converted/ and ignore-pattern filters.
        in_memory = memory_exists and p.is_relative_to(memory_dir)
        if not in_memory:
            # Skip files inside our own converted/ dir (avoid re-processing sidecars)
            if p.is_relative_to(converted_dir):
                continue
            if _is_ignored(p, root, ignore_patterns):
                continue
        if _is_sensitive(p):
            skipped_sensitive.append(str(p))
            continue
        ftype = classify_file(p)
        if not ftype:
            continue

        suffix = p.suffix.lower()
        if suffix in GOOGLE_WORKSPACE_EXTENSIONS:
            if not google_workspace:
                skipped_sensitive.append(
                    str(p)
                    + " [Google Workspace shortcut skipped - pass --google-workspace "
                    "or set GRAPHIFY_GOOGLE_WORKSPACE=1]"
                )
                continue
            try:
                md_path = convert_google_workspace_file(p, converted_dir, xlsx_to_markdown=xlsx_to_markdown)
            except Exception as exc:
                skipped_sensitive.append(str(p) + f" [Google Workspace export failed: {exc}]")
                continue
            if md_path:
                if _is_ignored(md_path, root, ignore_patterns):
                    continue
                files[ftype].append(str(md_path))
                total_words += count_words(md_path)
            else:
                skipped_sensitive.append(str(p) + " [Google Workspace export produced no readable text]")
            continue

        # Office files: convert to markdown sidecar so subagents can read them
        if suffix in OFFICE_EXTENSIONS:
            md_path = convert_office_file(p, converted_dir)
            if md_path:
                if _is_ignored(md_path, root, ignore_patterns):
                    continue
                files[ftype].append(str(md_path))
                total_words += count_words(md_path)
            else:
                # Conversion failed (library not installed) - skip with note
                skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]")
            continue

        files[ftype].append(str(p))
        if ftype != FileType.VIDEO:
            total_words += count_words(p)

    for ftype in files:
        files[ftype].sort()

    total_files = sum(len(v) for v in files.values())
    needs_graph = total_words >= CORPUS_WARN_THRESHOLD

    # Determine warning - lower bound or upper bound
    warning: str | None = None
    if not needs_graph:
        warning = (
            f"Corpus is ~{total_words:,} words - fits in a single context window. "
            f"You may not need a graph."
        )
    elif total_words >= CORPUS_UPPER_THRESHOLD or total_files >= FILE_COUNT_UPPER:
        warning = (
            f"Large corpus: {total_files} files · ~{total_words:,} words. "
            f"Semantic extraction will be expensive (many Claude tokens). "
            f"Consider running on a subfolder."
        )

    return {
        "files": {k.value: v for k, v in files.items()},
        "total_files": total_files,
        "total_words": total_words,
        "needs_graph": needs_graph,
        "warning": warning,
        "skipped_sensitive": skipped_sensitive,
        "graphifyignore_patterns": len(ignore_patterns),
        "scan_root": str(root.resolve()),
    }
1119
-
1120
-
1121
def _md5_file(path: Path) -> str:
    """Hex MD5 of the file's bytes, read 64KB at a time — change detection only.

    Returns ``""`` when the file cannot be opened or read.
    """
    import hashlib as _hl

    digest = _hl.md5(usedforsecurity=False)
    try:
        with path.open("rb") as stream:
            while True:
                block = stream.read(65536)
                if not block:
                    break
                digest.update(block)
    except OSError:
        return ""
    return digest.hexdigest()
1132
-
1133
-
1134
def _to_relative_for_storage(key: str, root: Path) -> str:
    """Encode ``key`` as a forward-slash path relative to ``root``.

    Two kinds of key are returned untouched: keys that are already relative,
    and absolute keys that lie outside ``root`` (out-of-tree symlinked
    sources, external --include paths, other Windows drives). This mirrors
    the fallback in :func:`graphify.watch._relativize_source_files`, so the
    stored artifact still round-trips when some paths cannot be encoded
    portably.

    Only ``root`` is resolved. The key is relativized symbolically so that an
    in-root symlink (e.g. ``alias.py -> sub/target.py``) is stored under its
    own name; resolving it would store the target instead, the original key
    would miss on reload, and the file would be re-extracted on every
    incremental run.
    """
    candidate = Path(key)
    if not candidate.is_absolute():
        return key

    try:
        relative = os.path.relpath(candidate, Path(root).resolve())
    except (ValueError, OSError):
        return key  # outside root (e.g. Windows cross-drive)

    # ``os.path.relpath`` yields ``../foo`` for paths outside root instead of
    # raising like ``relative_to``; keep such entries in absolute form.
    escapes_root = relative == ".." or relative.startswith((".." + os.sep, "../"))
    return key if escapes_root else relative.replace(os.sep, "/")
1162
-
1163
-
1164
def _to_absolute_from_storage(key: str, root: Path) -> str:
    """Undo :func:`_to_relative_for_storage` by re-anchoring ``key`` at ``root``.

    Keys that are already absolute (legacy manifests, out-of-root entries) are
    returned as they are, which keeps manifests written before relative keys
    existed readable. Relative keys are joined onto ``Path(root).resolve()``
    so the result lines up with the paths :func:`detect` produces (it resolves
    the scan root too).
    """
    stored = Path(key)
    anchored = stored if stored.is_absolute() else Path(root).resolve() / stored
    return str(anchored)
1177
-
1178
-
1179
def load_manifest(
    manifest_path: str = _MANIFEST_PATH,
    *,
    root: Path | None = None,
) -> dict:
    """Load the manifest from a previous run. Returns {} on any error.

    A manifest that is missing, unreadable, not valid JSON, or valid JSON
    whose top level is not an object all count as errors, so callers can
    rely on always receiving a dict.

    When ``root`` is provided, stored relative keys are re-anchored against
    it so callers see absolute paths regardless of on-disk format. Legacy
    manifests with absolute keys pass through unchanged, so a graphify-out/
    written by an older version (or by a caller that didn't supply ``root``
    to :func:`save_manifest`) remains readable.

    Args:
        manifest_path: Location of the manifest JSON file.
        root: Scan root used to turn stored relative keys into absolute paths.

    Returns:
        Mapping of file path to its stored manifest entry.
    """
    try:
        raw = json.loads(Path(manifest_path).read_text(encoding="utf-8"))
    except Exception:
        return {}
    if not isinstance(raw, dict):
        # e.g. a JSON list or scalar: corrupt for our purposes.
        return {}
    if root is None:
        return raw
    return {_to_absolute_from_storage(k, root): v for k, v in raw.items()}
1199
-
1200
-
1201
def save_manifest(
    files: dict[str, list[str]],
    manifest_path: str = _MANIFEST_PATH,
    *,
    kind: str = "both",
    root: Path | None = None,
) -> None:
    """Record mtimes and content hashes of ``files`` for later change detection.

    kind="ast"      — written by `graphify update` (AST-only rebuild). Stamps
                      ast_hash; an existing semantic_hash is kept only when
                      the new content hash equals the stored ast_hash.
    kind="semantic" — written by `graphify extract` after semantic extraction.
                      Stamps semantic_hash; keeps the existing ast_hash.
    kind="both"     — full pipeline: stamps both hashes (default).

    With ``root`` given, keys are written relative to it in forward-slash
    form so the manifest is portable across machines and checkout locations
    (#777); entries outside ``root`` stay absolute so they still round-trip
    on the saving machine. With ``root`` left as None the legacy
    absolute-keyed format is written.
    """
    previous = load_manifest(manifest_path, root=root)

    def _normalise_entry(entry):
        # Bring any historical entry shape to the current dict schema;
        # None signals an entry that cannot be interpreted.
        if isinstance(entry, (int, float)):
            return {"mtime": entry, "ast_hash": "", "semantic_hash": ""}
        if not isinstance(entry, dict):
            return None
        if "hash" in entry and "ast_hash" not in entry:
            return {"mtime": entry.get("mtime", 0), "ast_hash": entry["hash"], "semantic_hash": ""}
        return entry

    # Start from the previous manifest so an incremental caller that passes
    # only a subset of files does not wipe the untouched entries (#917).
    # Entries whose file has vanished from disk are dropped: those are real
    # deletions that detect_incremental() should see as gone.
    manifest: dict[str, dict] = {}
    for stored_path, stored_entry in previous.items():
        normalised = _normalise_entry(stored_entry)
        if normalised is None:
            continue
        try:
            still_present = Path(stored_path).exists()
        except OSError:
            continue
        if still_present:
            manifest[stored_path] = normalised

    stamps_ast = kind in ("ast", "both")
    stamps_semantic = kind in ("semantic", "both")

    for f in (name for group in files.values() for name in group):
        try:
            p = Path(f)
            mtime = p.stat().st_mtime
            h = _md5_file(p)
        except OSError:
            continue  # file deleted between detect() and manifest write
        prev = _normalise_entry(previous.get(f, {})) or {}

        ast_hash = h if stamps_ast else prev.get("ast_hash", "")
        if stamps_semantic:
            semantic_hash = h
        elif h == prev.get("ast_hash", ""):
            # Content unchanged since the last stamp: keep the semantic hash.
            semantic_hash = prev.get("semantic_hash", "")
        else:
            semantic_hash = ""
        manifest[f] = {"mtime": mtime, "ast_hash": ast_hash, "semantic_hash": semantic_hash}

    if root is not None:
        # Store portable forward-slash relative keys. Keys outside ``root``
        # (out-of-tree symlinked corpora, --include sources) keep their
        # absolute form so the manifest still round-trips on this machine.
        manifest = {_to_relative_for_storage(k, root): v for k, v in manifest.items()}

    target = Path(manifest_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
1277
-
1278
-
1279
def detect_incremental(
    root: Path,
    manifest_path: str = _MANIFEST_PATH,
    *,
    follow_symlinks: bool | None = None,
    google_workspace: bool | None = None,
    kind: str = "semantic",
    extra_excludes: list[str] | None = None,
) -> dict:
    """Like detect(), but returns only new or modified files since the last run.

    kind="semantic" (default for extract): a file is "changed" when its
      semantic_hash is missing or its content has changed since the last
      semantic extraction pass. Use this for `graphify extract` so that
      files touched by `graphify update` (AST-only) are re-extracted
      semantically.
    kind="ast": a file is "changed" when its ast_hash is missing or its
      content has changed. Use this for `graphify update`.

    Fast path: stored hash present and mtime unchanged → unchanged, with no
    file read. Slow path: mtime differs → compare MD5 against the relevant
    hash field before re-extracting.

    Backwards compatible with legacy manifests storing plain float mtime values
    or {mtime, hash} dicts (treated as ast_hash only; semantic_hash = miss).

    The ``follow_symlinks`` flag is forwarded to :func:`detect` so corpora that
    rely on symlinked sub-trees (e.g. a ``state_of_truth/`` symlink pointing to a
    directory outside the scan root) are scanned consistently between full and
    incremental runs. ``None`` (default) means auto-detect: ``True`` when ``root``
    contains at least one direct symlinked child, ``False`` otherwise.

    Returns:
        The :func:`detect` result extended with ``incremental`` (always True),
        ``new_files``, ``unchanged_files``, ``new_total`` and
        ``deleted_files``. All five keys are present on every path, including
        the first run where no manifest exists yet.
    """
    full = detect(root, follow_symlinks=follow_symlinks, google_workspace=google_workspace, extra_excludes=extra_excludes)
    # Pass ``root`` so a manifest written with relative keys (post-#777) is
    # re-anchored to the absolute form the rest of this function compares
    # against. Legacy absolute-keyed manifests pass through unchanged.
    manifest = load_manifest(manifest_path, root=root)

    if not manifest:
        # No previous run - treat everything as new
        full["incremental"] = True
        full["new_files"] = full["files"]
        full["unchanged_files"] = {k: [] for k in full["files"]}
        full["new_total"] = full["total_files"]
        # Nothing was tracked before, so nothing can have been deleted; set
        # the key anyway so the result shape matches the incremental path.
        full["deleted_files"] = []
        return full

    new_files: dict[str, list[str]] = {k: [] for k in full["files"]}
    unchanged_files: dict[str, list[str]] = {k: [] for k in full["files"]}
    hash_key = "semantic_hash" if kind == "semantic" else "ast_hash"

    for ftype, file_list in full["files"].items():
        for f in file_list:
            stored = manifest.get(f)
            try:
                current_mtime = Path(f).stat().st_mtime
            except Exception:
                current_mtime = 0

            if isinstance(stored, (int, float)):
                # Legacy manifest: plain mtime value, no content hash to check.
                changed = current_mtime > stored
            elif isinstance(stored, dict):
                # Normalise legacy {mtime, hash} to new schema
                if "hash" in stored and "ast_hash" not in stored:
                    stored = {"mtime": stored.get("mtime", 0), "ast_hash": stored["hash"], "semantic_hash": ""}
                stored_hash = stored.get(hash_key, "")
                if not stored_hash:
                    # Missing hash for this kind (e.g. update ran but extract
                    # hasn't) — always re-extract.
                    changed = True
                else:
                    stored_mtime = stored.get("mtime")
                    # Schema-drift guard (#1163): tolerate a nested {mtime: ...}
                    # dict or any non-numeric value without crashing.
                    if isinstance(stored_mtime, dict):
                        stored_mtime = stored_mtime.get("mtime")
                    if not isinstance(stored_mtime, (int, float)):
                        stored_mtime = None
                    if stored_mtime is None or current_mtime != stored_mtime:
                        # mtime differs — verify with content hash before re-extracting
                        changed = _md5_file(Path(f)) != stored_hash
                    else:
                        changed = False
            else:
                changed = True  # not in manifest or unknown format, re-extract to be safe

            if changed:
                new_files[ftype].append(f)
            else:
                unchanged_files[ftype].append(f)

    # Files in manifest that no longer exist - their cached nodes are now ghost nodes
    current_files = {f for flist in full["files"].values() for f in flist}
    deleted_files = [f for f in manifest if f not in current_files]

    full["incremental"] = True
    full["new_files"] = new_files
    full["unchanged_files"] = unchanged_files
    full["new_total"] = sum(len(v) for v in new_files.values())
    full["deleted_files"] = deleted_files
    return full