1bcoder 0.1.12__tar.gz → 0.1.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/PKG-INFO +1 -1
  2. {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/SOURCES.txt +2 -0
  3. {1bcoder-0.1.12 → 1bcoder-0.1.13}/PKG-INFO +1 -1
  4. 1bcoder-0.1.13/_bcoder_data/flows/__pycache__/webcrawl.cpython-311.pyc +0 -0
  5. 1bcoder-0.1.13/_bcoder_data/flows/webcrawl.py +369 -0
  6. {1bcoder-0.1.12 → 1bcoder-0.1.13}/map_query.py +22 -6
  7. {1bcoder-0.1.12 → 1bcoder-0.1.13}/pyproject.toml +1 -1
  8. {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/dependency_links.txt +0 -0
  9. {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/entry_points.txt +0 -0
  10. {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/requires.txt +0 -0
  11. {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/top_level.txt +0 -0
  12. {1bcoder-0.1.12 → 1bcoder-0.1.13}/LICENSE +0 -0
  13. {1bcoder-0.1.12 → 1bcoder-0.1.13}/README.md +0 -0
  14. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/__init__.py +0 -0
  15. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/advance.txt +0 -0
  16. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/ask.txt +0 -0
  17. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/compact.txt +0 -0
  18. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/concepts.txt +0 -0
  19. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/fill.txt +0 -0
  20. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/planning.txt +0 -0
  21. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/scan.txt +0 -0
  22. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/sqlite.txt +0 -0
  23. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/websearch.txt +0 -0
  24. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/aliases.txt +0 -0
  25. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/FLOWS.md +0 -0
  26. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/MCP.md +0 -0
  27. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/OLLAMA_SERVER_PARAM.md +0 -0
  28. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/PARAM.md +0 -0
  29. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/PROC.md +0 -0
  30. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/TRANSLATE.md +0 -0
  31. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/__pycache__/commit_message.cpython-311.pyc +0 -0
  32. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/commit_message.py +0 -0
  33. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/deepagent.py +0 -0
  34. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/grounding.py +0 -0
  35. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/py_error_trace.py +0 -0
  36. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/simargl_files.py +0 -0
  37. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/visual_search.py +0 -0
  38. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/webask.py +0 -0
  39. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/map.txt +0 -0
  40. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/action-required.py +0 -0
  41. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/add-save.py +0 -0
  42. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/assist.py +0 -0
  43. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/collect-files.py +0 -0
  44. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/ctx_cut.py +0 -0
  45. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/extract-code.py +0 -0
  46. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/extract-files.py +0 -0
  47. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/extract-list.py +0 -0
  48. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/grounding-check.py +0 -0
  49. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/md.py +0 -0
  50. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/mdx.py +0 -0
  51. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/pattern-gate.py +0 -0
  52. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/regexp-extract.py +0 -0
  53. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/rude_words.py +0 -0
  54. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/scan-save.py +0 -0
  55. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/secret_check.py +0 -0
  56. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/sql_readonly_guard.py +0 -0
  57. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/tempctx-cut.py +0 -0
  58. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/profiles.txt +0 -0
  59. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/prompts/analysis.txt +0 -0
  60. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/prompts/sumarise.txt +0 -0
  61. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/prompts.txt +0 -0
  62. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/AddFunction.txt +0 -0
  63. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/AskProject.txt +0 -0
  64. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/CheckRequirements.txt +0 -0
  65. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/DockerMySQL.txt +0 -0
  66. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/DockerNginx.txt +0 -0
  67. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/DockerPython.txt +0 -0
  68. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/DockerStack.txt +0 -0
  69. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/DuckDuckGoInstant.txt +0 -0
  70. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/EnvTemplate.txt +0 -0
  71. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/Explain.txt +0 -0
  72. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/ExploreProjectStructure.txt +0 -0
  73. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/GitIgnorePython.txt +0 -0
  74. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/MySQLDump.txt +0 -0
  75. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/NewScript.txt +0 -0
  76. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/PipFreeze.txt +0 -0
  77. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/PyPI.txt +0 -0
  78. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/Refactor.txt +0 -0
  79. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/RunAndFix.txt +0 -0
  80. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/SQLiteSchema.txt +0 -0
  81. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/Translate.txt +0 -0
  82. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/WikiPage.txt +0 -0
  83. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/WikiSearch.txt +0 -0
  84. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/auto-bkup.txt +0 -0
  85. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/edit-control.txt +0 -0
  86. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/parallel_call.txt +0 -0
  87. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/personal/content/create-regular-content.txt +0 -0
  88. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/personal/content/plan.txt +0 -0
  89. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/personal/test/collect-data-from-test-environment.txt +0 -0
  90. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/plan.txt +0 -0
  91. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/remote/create-content-on-remote-server.txt +0 -0
  92. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/set_ctx.txt +0 -0
  93. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/simargl-cli_index_files.txt +0 -0
  94. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/simargl-cli_index_units.txt +0 -0
  95. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/simargl-cli_search.txt +0 -0
  96. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/team-map-worker.txt +0 -0
  97. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/team-search-worker.txt +0 -0
  98. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/team-summarize.txt +0 -0
  99. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/team-tree-worker.txt +0 -0
  100. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/test.txt +0 -0
  101. {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/teams/code-analysis.yaml +0 -0
  102. {1bcoder-0.1.12 → 1bcoder-0.1.13}/chat.py +0 -0
  103. {1bcoder-0.1.12 → 1bcoder-0.1.13}/map_index.py +0 -0
  104. {1bcoder-0.1.12 → 1bcoder-0.1.13}/setup.cfg +0 -0
  105. {1bcoder-0.1.12 → 1bcoder-0.1.13}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: 1bcoder
3
- Version: 0.1.12
3
+ Version: 0.1.13
4
4
  Summary: AI coding assistant agent for 1B–7B local models (Ollama, LMStudio, llama.cpp). Terminal REPL with file editing, project map, agents, scripts, and parallel multi-model queries.
5
5
  Project-URL: Homepage, https://github.com/szholobetsky/1bcoder
6
6
  Project-URL: Repository, https://github.com/szholobetsky/1bcoder
@@ -37,7 +37,9 @@ _bcoder_data/flows/py_error_trace.py
37
37
  _bcoder_data/flows/simargl_files.py
38
38
  _bcoder_data/flows/visual_search.py
39
39
  _bcoder_data/flows/webask.py
40
+ _bcoder_data/flows/webcrawl.py
40
41
  _bcoder_data/flows/__pycache__/commit_message.cpython-311.pyc
42
+ _bcoder_data/flows/__pycache__/webcrawl.cpython-311.pyc
41
43
  _bcoder_data/proc/action-required.py
42
44
  _bcoder_data/proc/add-save.py
43
45
  _bcoder_data/proc/assist.py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: 1bcoder
3
- Version: 0.1.12
3
+ Version: 0.1.13
4
4
  Summary: AI coding assistant agent for 1B–7B local models (Ollama, LMStudio, llama.cpp). Terminal REPL with file editing, project map, agents, scripts, and parallel multi-model queries.
5
5
  Project-URL: Homepage, https://github.com/szholobetsky/1bcoder
6
6
  Project-URL: Repository, https://github.com/szholobetsky/1bcoder
@@ -0,0 +1,369 @@
1
+ """Crawl URLs, save pages or extract structured data.
2
+
3
+ Modes:
4
+ extract (default) — XPath columns → CSV
5
+ pages — each page as .txt file (URL structure preserved)
6
+ combine — all pages as one .txt file
7
+ mirror — save .html files with relative links
8
+
9
+ Usage:
10
+ /flow webcrawl <url> name:"//h1/text()" price:"//span/text()" [--out result.csv]
11
+ /flow webcrawl <url> --columns cols.yaml [--out result.csv]
12
+ /flow webcrawl <url> --mode pages --out ./pages/ [--depth 2]
13
+ /flow webcrawl <url> --mode combine --out all.txt [--depth 2]
14
+ /flow webcrawl <url> --mode mirror --out ./mirror/ [--depth 2]
15
+ /flow webcrawl <url> --mode pages --filter mysite.com/docs --out ./docs/
16
+
17
+ Add --ask to any mode for LLM summary after completion.
18
+
19
+ --columns YAML format (cols.yaml):
20
+ name: "//h1/text()"
21
+ price: "//span[@class='price']/text()"
22
+ sku: "//*[@class='sku']/text()"
23
+ category: "//nav[@class='breadcrumb']/a[last()]/text()"
24
+
25
+ Note: use single quotes inside XPath for attribute values — @class='price'
26
+ Each column runs its XPath on every page; rows are zipped across columns.
27
+ """
28
+ import re as _re
29
+ import os as _os
30
+ import csv as _csv
31
+ from collections import deque as _deque
32
+ from itertools import zip_longest as _zip_longest
33
+ from urllib.request import urlopen as _urlopen, Request as _Request
34
+ from urllib.parse import urljoin as _urljoin, urlparse as _urlparse
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # HTTP
39
+ # ---------------------------------------------------------------------------
40
+
41
+ def _fetch(url: str, timeout: int = 12) -> bytes | None:
42
+ try:
43
+ req = _Request(url, headers={"User-Agent": "Mozilla/5.0"})
44
+ with _urlopen(req, timeout=timeout) as r:
45
+ ct = r.headers.get("Content-Type", "")
46
+ if "html" not in ct and "xml" not in ct and "text" not in ct:
47
+ return None
48
+ return r.read()
49
+ except Exception as e:
50
+ print(f"[webcrawl] skip {url}: {e}")
51
+ return None
52
+
53
+
54
+ def _extract_links(tree, base_url: str, domain: str) -> list[str]:
55
+ links = []
56
+ for href in tree.xpath("//a/@href"):
57
+ abs_url = _urljoin(base_url, href)
58
+ p = _urlparse(abs_url)
59
+ if p.scheme in ("http", "https") and p.netloc == domain:
60
+ links.append(abs_url.split("#")[0])
61
+ return links
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # HTML → plain text
66
+ # ---------------------------------------------------------------------------
67
+
68
def _to_text(tree) -> str:
    """Convert a parsed HTML tree to plain text.

    ``script``, ``style``, ``nav``, ``header``, ``footer`` and ``aside``
    elements are removed from *tree* in place, so the tree is modified.
    The text of ``<body>`` (or of the whole tree when there is no body) is
    then returned with every line stripped and blank lines dropped.
    """
    for tag in tree.xpath("//script|//style|//nav|//header|//footer|//aside"):
        p = tag.getparent()
        if p is None:
            continue
        # lxml's remove() takes the element's tail text with it, which would
        # drop real page text that merely follows the closing tag. Re-attach
        # the tail to the previous sibling (or the parent) first.
        tail = tag.tail
        if tail:
            prev = tag.getprevious()
            if prev is not None:
                prev.tail = (prev.tail or "") + tail
            else:
                p.text = (p.text or "") + tail
        p.remove(tag)
    body = tree.find(".//body")
    node = body if body is not None else tree
    # text_content() only exists on lxml.html elements; XPath string() gives
    # the same concatenated text and also works on plain etree elements.
    raw = node.xpath("string()")
    # collapse whitespace
    lines = [ln.strip() for ln in raw.splitlines()]
    return "\n".join(ln for ln in lines if ln)
78
+
79
+
80
+ # ---------------------------------------------------------------------------
81
+ # URL → local file path
82
+ # ---------------------------------------------------------------------------
83
+
84
+ def _url_to_relpath(url: str, ext: str) -> str:
85
+ path = _urlparse(url).path.strip("/")
86
+ if not path:
87
+ return "index" + ext
88
+ if path.endswith("/"):
89
+ path = path.rstrip("/") + "/index"
90
+ root, _ = _os.path.splitext(path)
91
+ return root + ext
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # Argument parsing helpers
96
+ # ---------------------------------------------------------------------------
97
+
98
+ def _parse_inline_columns(args: str) -> dict[str, str]:
99
+ """Parse name:"//xpath" pairs — XPath must use single quotes for attributes."""
100
+ return {m.group(1): m.group(2)
101
+ for m in _re.finditer(r'(\w+):"([^"]*)"', args)}
102
+
103
+
104
+ def _load_yaml_columns(path: str) -> dict[str, str]:
105
+ try:
106
+ import yaml as _yaml
107
+ with open(path, encoding="utf-8") as f:
108
+ return _yaml.safe_load(f)
109
+ except ImportError:
110
+ # fallback: naive key: "value" parser
111
+ result = {}
112
+ with open(path, encoding="utf-8") as f:
113
+ for line in f:
114
+ m = _re.match(r'\s*(\w+)\s*:\s*["\']?(.+?)["\']?\s*$', line)
115
+ if m:
116
+ result[m.group(1)] = m.group(2).strip('"\'')
117
+ return result
118
+
119
+
120
+ def _strip_flags(args: str) -> str:
121
+ for pat in (r'\w+:"[^"]*"', r"--columns\s+\S+", r"--mode\s+\S+",
122
+ r"--depth\s+\d+", r"--out\s+\S+", r"--filter\s+\S+", r"--ask"):
123
+ args = _re.sub(pat, "", args)
124
+ return args.strip()
125
+
126
+
127
+ # ---------------------------------------------------------------------------
128
+ # BFS crawler
129
+ # ---------------------------------------------------------------------------
130
+
131
+ def _normalize_filter(f: str) -> str:
132
+ """Ensure filter has scheme and no trailing slash."""
133
+ if not f.startswith("http"):
134
+ f = "https://" + f
135
+ return f.rstrip("/")
136
+
137
+
138
def _crawl(start_url: str, max_depth: int, url_filter: str | None = None):
    """Yield (url, depth, raw_bytes, lxml_tree) for each reachable page.

    Breadth-first traversal from *start_url*. Only links on the start URL's
    network location are followed, at most *max_depth* levels deep. When
    *url_filter* is given, URLs that do not start with the normalised filter
    are skipped (this applies to the start URL too).

    Raises:
        ImportError: If lxml is not installed.
    """
    try:
        from lxml import etree as _et
    except ImportError as exc:
        raise ImportError("lxml not installed — run: pip install lxml") from exc

    prefix = _normalize_filter(url_filter) if url_filter else None
    domain = _urlparse(start_url).netloc
    queue = _deque([(start_url, 0)])
    visited: set[str] = set()

    while queue:
        url, depth = queue.popleft()
        if url in visited:
            continue
        if prefix and not url.startswith(prefix):
            continue
        visited.add(url)

        print(f"[webcrawl] ({depth}/{max_depth}) {url}")
        raw = _fetch(url)
        if raw is None:
            continue

        try:
            tree = _et.fromstring(raw, _et.HTMLParser())
        except Exception as e:
            print(f"[webcrawl] parse error {url}: {e}")
            continue
        if tree is None:
            # The HTML parser can return no root element at all (lxml.html
            # itself guards against this); consumers need a real tree.
            print(f"[webcrawl] parse error {url}: empty document")
            continue

        # Collect outgoing links BEFORE handing the tree to the consumer:
        # consumers may modify it (stripping nav/header/footer for text
        # output), which would hide those links from the crawl.
        links = _extract_links(tree, url, domain) if depth < max_depth else []

        yield url, depth, raw, tree

        for link in links:
            if link not in visited:
                queue.append((link, depth + 1))
175
+
176
+
177
+ # ---------------------------------------------------------------------------
178
+ # Modes
179
+ # ---------------------------------------------------------------------------
180
+
181
def _node_text(node) -> str:
    """Return the stripped text of a single XPath result item."""
    if hasattr(node, "text_content"):
        return node.text_content().strip()
    if hasattr(node, "itertext"):
        # Plain etree elements have no text_content(); str() on them would
        # give "<Element h1 at 0x...>" instead of the element's text.
        return "".join(node.itertext()).strip()
    return str(node).strip()


def _mode_extract(start_url, max_depth, columns, out_path, url_filter=None):
    """Crawl and write one CSV row per zipped set of column values.

    Args:
        start_url: First URL to crawl.
        max_depth: Maximum link depth passed to the crawler.
        columns: Mapping of column name to XPath expression.
        out_path: CSV file to write (UTF-8, header ``url`` + column names).
        url_filter: Optional URL prefix filter passed to the crawler.

    Returns:
        The list of row tuples ``(url, value, ...)`` that was written.
        Columns with fewer matches on a page are padded with ``""``.
    """
    rows = []
    col_names = list(columns.keys())
    xpaths = list(columns.values())

    for url, depth, raw, tree in _crawl(start_url, max_depth, url_filter):
        results = []
        for xp in xpaths:
            nodes = tree.xpath(xp)
            # Expressions such as count(...) or string(...) return a scalar
            # rather than a list; iterating a str would split it into
            # characters and iterating a number raises TypeError.
            if not isinstance(nodes, list):
                nodes = [nodes]
            texts = (_node_text(n) for n in nodes)
            results.append([text for text in texts if text])

        for values in _zip_longest(*results, fillvalue=""):
            rows.append((url,) + tuple(values))

    with open(out_path, "w", newline="", encoding="utf-8") as f:
        w = _csv.writer(f)
        w.writerow(["url"] + col_names)
        w.writerows(rows)

    print(f"[webcrawl] extract done — {len(rows)} rows → {out_path}")
    return rows
207
+
208
+
209
def _mode_pages(start_url, max_depth, out_dir, url_filter=None):
    """Save every crawled page as a text file under *out_dir*.

    Each file's location follows the page's URL path and its content starts
    with a ``URL:`` header line followed by the page text.

    Returns:
        The number of files written.
    """
    _os.makedirs(out_dir, exist_ok=True)
    saved = 0
    for page_url, _depth, _raw, doc in _crawl(start_url, max_depth, url_filter):
        target = _os.path.join(out_dir, _url_to_relpath(page_url, ".txt"))
        # The URL path may introduce sub-directories that do not exist yet.
        _os.makedirs(_os.path.dirname(target), exist_ok=True)
        with open(target, "w", encoding="utf-8") as out:
            out.write(f"URL: {page_url}\n\n")
            out.write(_to_text(doc))
        saved += 1
    print(f"[webcrawl] pages done — {saved} files → {out_dir}")
    return saved
222
+
223
+
224
def _mode_combine(start_url, max_depth, out_path, url_filter=None):
    """Write the text of all crawled pages into a single file.

    Every page contributes a ``=== <url> ===`` heading followed by its text;
    pages are separated by a horizontal rule line.

    Returns:
        The list of per-page strings that was written.
    """
    parts = [
        f"=== {url} ===\n\n{_to_text(tree)}"
        for url, _depth, _raw, tree in _crawl(start_url, max_depth, url_filter)
    ]
    # Build the separator first: writing `"\n\n" + rule + "\n\n".join(parts)`
    # binds the join to the last literal only, which put the rule once at the
    # top of the file instead of between pages.
    separator = "\n\n" + ("─" * 60) + "\n\n"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(separator.join(parts))
    print(f"[webcrawl] combine done — {len(parts)} pages → {out_path}")
    return parts
233
+
234
+
235
def _mode_mirror(start_url, max_depth, out_dir, url_filter=None):
    """Save crawled pages as ``.html`` files with links made relative.

    Works in two passes: the first crawls and records which local file each
    URL maps to, the second rewrites ``<a href>`` values that point at a
    crawled page and writes the files as UTF-8.

    Returns:
        The number of HTML files written.
    """
    from lxml import etree as _et

    _os.makedirs(out_dir, exist_ok=True)

    # pass 1 — crawl, collect pages, build url→local_file index
    pages: list[tuple[str, bytes, object]] = []
    url_to_file: dict[str, str] = {}
    for url, depth, raw, tree in _crawl(start_url, max_depth, url_filter):
        rel = _url_to_relpath(url, ".html")
        dest = _os.path.join(out_dir, rel)
        pages.append((url, raw, tree))
        url_to_file[url] = dest

    # pass 2 — rewrite links, save
    for url, raw, tree in pages:
        dest = url_to_file[url]
        from_dir = _os.path.dirname(dest)

        # remove <base href> — it overrides all relative links and points back to original site
        for base in tree.xpath("//base"):
            p = base.getparent()
            if p is not None:
                p.remove(base)

        for a in tree.xpath("//a[@href]"):
            href = a.get("href", "")
            try:
                abs_href, _, fragment = _urljoin(url, href).partition("#")
            except ValueError:
                # Malformed href: leave the link untouched rather than lose
                # the whole mirror after the crawl has already finished.
                continue
            to_file = url_to_file.get(abs_href)
            if to_file is None:
                continue
            rel_path = _os.path.relpath(to_file, from_dir).replace("\\", "/")
            # Keep the fragment so in-page anchors still work offline.
            a.set("href", f"{rel_path}#{fragment}" if fragment else rel_path)

        _os.makedirs(from_dir, exist_ok=True)
        html_bytes = _et.tostring(tree, method="html", encoding="unicode").encode("utf-8")
        with open(dest, "wb") as f:
            f.write(html_bytes)

    print(f"[webcrawl] mirror done — {len(pages)} html files → {out_dir}")
    return len(pages)
275
+
276
+
277
+ # ---------------------------------------------------------------------------
278
+ # Entry point
279
+ # ---------------------------------------------------------------------------
280
+
281
def run(chat, args: str):
    """Parse the ``/flow webcrawl`` argument string and run the chosen mode.

    Flags read from *args*: ``--mode`` (default ``extract``), ``--depth``
    (default 2), ``--out``, ``--columns``, ``--filter`` and ``--ask``. What
    remains after flags and inline ``name:"xpath"`` pairs are stripped is
    the start URL. With ``--ask``, a summary of the result is requested
    through *chat* and the exchange is appended to ``chat.messages``.
    """
    def _flag_value(pattern):
        # First capture group of *pattern* in args, or None when absent.
        found = _re.search(pattern, args)
        return found.group(1) if found else None

    mode_arg = _flag_value(r"--mode\s+(\S+)")
    depth_arg = _flag_value(r"--depth\s+(\d+)")
    out_arg = _flag_value(r"--out\s+(\S+)")
    columns_arg = _flag_value(r"--columns\s+(\S+)")
    url_filter = _flag_value(r"--filter\s+(\S+)")
    ask_summary = "--ask" in args

    mode = "extract" if mode_arg is None else mode_arg
    max_depth = 2 if depth_arg is None else int(depth_arg)

    if url_filter:
        print(f"[webcrawl] filter: only pages under {_normalize_filter(url_filter)}")

    start_url = _strip_flags(args)
    if not start_url.startswith("http"):
        print(
            "usage:\n"
            " /flow webcrawl <url> name:\"//h1/text()\" price:\"//span/text()\" [--out result.csv]\n"
            " /flow webcrawl <url> --columns cols.yaml [--out result.csv]\n"
            " /flow webcrawl <url> --mode pages --out ./pages/\n"
            " /flow webcrawl <url> --mode combine --out all.txt\n"
            " /flow webcrawl <url> --mode mirror --out ./mirror/"
        )
        return

    # Report a missing lxml here instead of failing inside the crawl.
    try:
        from lxml import etree  # noqa: F401
    except ImportError:
        print("[webcrawl] lxml not installed — run: pip install lxml")
        return

    # ---- dispatch ----

    if mode == "extract":
        if columns_arg is not None:
            columns = _load_yaml_columns(columns_arg)
        else:
            columns = _parse_inline_columns(args)
        if not columns:
            print("[webcrawl] extract mode requires columns — inline or --columns file.yaml")
            return
        out_path = "crawl_result.csv" if out_arg is None else out_arg
        rows = _mode_extract(start_url, max_depth, columns, out_path, url_filter)
        summary_hint = f"{len(rows)} rows extracted to {out_path}"
        sample = "\n".join(",".join(map(str, row)) for row in rows[:40])

    elif mode == "pages":
        out_dir = "crawl_pages" if out_arg is None else out_arg
        saved = _mode_pages(start_url, max_depth, out_dir, url_filter)
        summary_hint = f"{saved} pages saved to {out_dir}"
        sample = f"Directory: {out_dir}"

    elif mode == "combine":
        out_path = "crawl_combined.txt" if out_arg is None else out_arg
        parts = _mode_combine(start_url, max_depth, out_path, url_filter)
        summary_hint = f"{len(parts)} pages combined to {out_path}"
        sample = "\n\n---\n\n".join(part[:500] for part in parts[:3])

    elif mode == "mirror":
        out_dir = "crawl_mirror" if out_arg is None else out_arg
        saved = _mode_mirror(start_url, max_depth, out_dir, url_filter)
        summary_hint = f"{saved} html files mirrored to {out_dir}"
        sample = f"Directory: {out_dir}"

    else:
        print(f"[webcrawl] unknown mode '{mode}' — use: extract | pages | combine | mirror")
        return

    if not ask_summary:
        return

    prompt = (
        f"I crawled {start_url} in '{mode}' mode (depth={max_depth}).\n"
        f"Result: {summary_hint}\n\n"
        f"Sample output:\n{sample}\n\n"
        f"Summarize what was found and note anything interesting or unexpected."
    )
    # One-off message list: the crawl sample is sent for this request only,
    # while chat.messages gets a short marker plus the reply below.
    temp_msgs = [
        {"role": "system", "content": chat._role},
        {"role": "user", "content": prompt},
    ]
    chat._sep("AI")
    reply = chat._stream_chat(temp_msgs)
    if not reply:
        return
    chat.last_reply = reply
    chat._last_output = reply
    chat.messages.append({"role": "user", "content": f"[webcrawl: {start_url} mode={mode}]"})
    chat.messages.append({"role": "assistant", "content": reply})
@@ -44,6 +44,16 @@ import argparse
44
44
  DEFAULT_MAP = os.path.join('.1bcoder', 'map.txt')
45
45
 
46
46
 
47
+ def _truncate(text: str, max_lines: int) -> str:
48
+ if max_lines <= 0:
49
+ return text
50
+ lines = text.splitlines()
51
+ if len(lines) <= max_lines:
52
+ return text
53
+ hidden = len(lines) - max_lines
54
+ return '\n'.join(lines[:max_lines]) + f'\n\n[truncated: {hidden} lines hidden — use --max-lines to see more]'
55
+
56
+
47
57
  # ── parse ───────────────────────────────────────────────────────────────────────
48
58
 
49
59
  def parse_map(map_path: str) -> tuple:
@@ -145,12 +155,13 @@ def compute_cohesion(links_map: dict, k: int = 5) -> tuple:
145
155
 
146
156
  # ── find ────────────────────────────────────────────────────────────────────────
147
157
 
148
- def find_map(map_path: str, query: str) -> tuple:
158
+ def find_map(map_path: str, query: str, max_lines: int = 0) -> tuple:
149
159
  """Search map.txt with filter syntax.
150
160
 
151
161
  Returns (hits, rendered_string).
152
162
  hits — list of matching block strings (empty list means full map returned).
153
163
  rendered_string — the text to display / inject.
164
+ max_lines — hard cap on output lines (0 = unlimited).
154
165
  """
155
166
  with open(map_path, encoding='utf-8') as f:
156
167
  content = f.read()
@@ -220,7 +231,7 @@ def find_map(map_path: str, query: str) -> tuple:
220
231
  if not b.startswith('#')
221
232
  for r in [process_block(b)] if r is not None]
222
233
 
223
- return hits, '\n'.join(hits)
234
+ return hits, _truncate('\n'.join(hits), max_lines)
224
235
 
225
236
 
226
237
  # ── trace ───────────────────────────────────────────────────────────────────────
@@ -301,10 +312,11 @@ def trace_deps(map_path: str, identifier: str, max_depth: int = 8, leaves_only:
301
312
  return '\n'.join(lines_out)
302
313
 
303
314
 
304
- def trace_map(map_path: str, identifier: str, max_depth: int = 8) -> str:
315
+ def trace_map(map_path: str, identifier: str, max_depth: int = 8, max_lines: int = 0) -> str:
305
316
  """BFS backwards through the call graph from a defined identifier.
306
317
 
307
318
  Returns a rendered string, or None if the identifier is not found in defines.
319
+ max_lines — hard cap on output lines (0 = unlimited).
308
320
  """
309
321
  defines_map, links_map = parse_map(map_path)
310
322
 
@@ -347,7 +359,7 @@ def trace_map(map_path: str, identifier: str, max_depth: int = 8) -> str:
347
359
  visited.add(caller)
348
360
  queue.append((caller, depth + 1))
349
361
 
350
- return '\n'.join(lines_out)
362
+ return _truncate('\n'.join(lines_out), max_lines)
351
363
 
352
364
 
353
365
  def _resolve_id(token: str, defines_map: dict):
@@ -628,9 +640,13 @@ def main():
628
640
  p_find = sub.add_parser('find', help='Filter map blocks by filename/content')
629
641
  p_find.add_argument('query', nargs='*',
630
642
  help='Filter tokens (term, !term, \\term, \\!term, \\\\!term)')
643
+ p_find.add_argument('--max-lines', type=int, default=200, metavar='N',
644
+ help='Hard cap on output lines (default: 200, 0 = unlimited)')
631
645
 
632
646
  p_trace = sub.add_parser('trace', help='Follow call chain backwards from an identifier')
633
647
  p_trace.add_argument('identifier', help='Identifier name to trace')
648
+ p_trace.add_argument('--max-lines', type=int, default=200, metavar='N',
649
+ help='Hard cap on output lines (default: 200, 0 = unlimited)')
634
650
 
635
651
  p_idiff = sub.add_parser('idiff', help='ORPHAN_DRIFT + GHOST ALERT between two map snapshots')
636
652
  p_idiff.add_argument('--prev', required=True, metavar='FILE',
@@ -645,7 +661,7 @@ def main():
645
661
 
646
662
  if args.cmd == 'find':
647
663
  query = ' '.join(args.query)
648
- hits, result = find_map(args.map, query)
664
+ hits, result = find_map(args.map, query, max_lines=args.max_lines)
649
665
  if not query:
650
666
  print(result)
651
667
  elif hits:
@@ -656,7 +672,7 @@ def main():
656
672
  sys.exit(1)
657
673
 
658
674
  elif args.cmd == 'trace':
659
- result = trace_map(args.map, args.identifier)
675
+ result = trace_map(args.map, args.identifier, max_lines=args.max_lines)
660
676
  if result is None:
661
677
  print(f"[map] '{args.identifier}' not found in any defines", file=sys.stderr)
662
678
  print(f"hint: try: python map_query.py find \\{args.identifier}", file=sys.stderr)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "1bcoder"
7
- version = "0.1.12"
7
+ version = "0.1.13"
8
8
  description = "AI coding assistant agent for 1B–7B local models (Ollama, LMStudio, llama.cpp). Terminal REPL with file editing, project map, agents, scripts, and parallel multi-model queries."
9
9
  requires-python = ">=3.10"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes