1bcoder 0.1.12__tar.gz → 0.1.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/PKG-INFO +1 -1
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/SOURCES.txt +2 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/PKG-INFO +1 -1
- 1bcoder-0.1.13/_bcoder_data/flows/__pycache__/webcrawl.cpython-311.pyc +0 -0
- 1bcoder-0.1.13/_bcoder_data/flows/webcrawl.py +369 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/map_query.py +22 -6
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/pyproject.toml +1 -1
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/dependency_links.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/entry_points.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/requires.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/1bcoder.egg-info/top_level.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/LICENSE +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/README.md +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/__init__.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/advance.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/ask.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/compact.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/concepts.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/fill.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/planning.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/scan.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/sqlite.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/agents/websearch.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/aliases.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/FLOWS.md +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/MCP.md +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/OLLAMA_SERVER_PARAM.md +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/PARAM.md +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/PROC.md +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/doc/TRANSLATE.md +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/__pycache__/commit_message.cpython-311.pyc +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/commit_message.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/deepagent.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/grounding.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/py_error_trace.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/simargl_files.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/visual_search.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/webask.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/map.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/action-required.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/add-save.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/assist.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/collect-files.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/ctx_cut.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/extract-code.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/extract-files.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/extract-list.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/grounding-check.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/md.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/mdx.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/pattern-gate.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/regexp-extract.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/rude_words.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/scan-save.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/secret_check.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/sql_readonly_guard.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/proc/tempctx-cut.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/profiles.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/prompts/analysis.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/prompts/sumarise.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/prompts.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/AddFunction.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/AskProject.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/CheckRequirements.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/DockerMySQL.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/DockerNginx.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/DockerPython.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/DockerStack.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/DuckDuckGoInstant.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/EnvTemplate.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/Explain.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/ExploreProjectStructure.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/GitIgnorePython.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/MySQLDump.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/NewScript.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/PipFreeze.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/PyPI.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/Refactor.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/RunAndFix.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/SQLiteSchema.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/Translate.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/WikiPage.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/WikiSearch.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/auto-bkup.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/edit-control.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/parallel_call.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/personal/content/create-regular-content.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/personal/content/plan.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/personal/test/collect-data-from-test-environment.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/plan.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/remote/create-content-on-remote-server.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/set_ctx.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/simargl-cli_index_files.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/simargl-cli_index_units.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/simargl-cli_search.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/team-map-worker.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/team-search-worker.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/team-summarize.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/team-tree-worker.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/test.txt +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/teams/code-analysis.yaml +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/chat.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/map_index.py +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/setup.cfg +0 -0
- {1bcoder-0.1.12 → 1bcoder-0.1.13}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: 1bcoder
|
|
3
|
-
Version: 0.1.12
|
|
3
|
+
Version: 0.1.13
|
|
4
4
|
Summary: AI coding assistant agent for 1B–7B local models (Ollama, LMStudio, llama.cpp). Terminal REPL with file editing, project map, agents, scripts, and parallel multi-model queries.
|
|
5
5
|
Project-URL: Homepage, https://github.com/szholobetsky/1bcoder
|
|
6
6
|
Project-URL: Repository, https://github.com/szholobetsky/1bcoder
|
|
@@ -37,7 +37,9 @@ _bcoder_data/flows/py_error_trace.py
|
|
|
37
37
|
_bcoder_data/flows/simargl_files.py
|
|
38
38
|
_bcoder_data/flows/visual_search.py
|
|
39
39
|
_bcoder_data/flows/webask.py
|
|
40
|
+
_bcoder_data/flows/webcrawl.py
|
|
40
41
|
_bcoder_data/flows/__pycache__/commit_message.cpython-311.pyc
|
|
42
|
+
_bcoder_data/flows/__pycache__/webcrawl.cpython-311.pyc
|
|
41
43
|
_bcoder_data/proc/action-required.py
|
|
42
44
|
_bcoder_data/proc/add-save.py
|
|
43
45
|
_bcoder_data/proc/assist.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: 1bcoder
|
|
3
|
-
Version: 0.1.12
|
|
3
|
+
Version: 0.1.13
|
|
4
4
|
Summary: AI coding assistant agent for 1B–7B local models (Ollama, LMStudio, llama.cpp). Terminal REPL with file editing, project map, agents, scripts, and parallel multi-model queries.
|
|
5
5
|
Project-URL: Homepage, https://github.com/szholobetsky/1bcoder
|
|
6
6
|
Project-URL: Repository, https://github.com/szholobetsky/1bcoder
|
|
Binary file
|
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
"""Crawl URLs, save pages or extract structured data.
|
|
2
|
+
|
|
3
|
+
Modes:
|
|
4
|
+
extract (default) — XPath columns → CSV
|
|
5
|
+
pages — each page as .txt file (URL structure preserved)
|
|
6
|
+
combine — all pages as one .txt file
|
|
7
|
+
mirror — save .html files with relative links
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
/flow webcrawl <url> name:"//h1/text()" price:"//span/text()" [--out result.csv]
|
|
11
|
+
/flow webcrawl <url> --columns cols.yaml [--out result.csv]
|
|
12
|
+
/flow webcrawl <url> --mode pages --out ./pages/ [--depth 2]
|
|
13
|
+
/flow webcrawl <url> --mode combine --out all.txt [--depth 2]
|
|
14
|
+
/flow webcrawl <url> --mode mirror --out ./mirror/ [--depth 2]
|
|
15
|
+
/flow webcrawl <url> --mode pages --filter mysite.com/docs --out ./docs/
|
|
16
|
+
|
|
17
|
+
Add --ask to any mode for LLM summary after completion.
|
|
18
|
+
|
|
19
|
+
--columns YAML format (cols.yaml):
|
|
20
|
+
name: "//h1/text()"
|
|
21
|
+
price: "//span[@class='price']/text()"
|
|
22
|
+
sku: "//*[@class='sku']/text()"
|
|
23
|
+
category: "//nav[@class='breadcrumb']/a[last()]/text()"
|
|
24
|
+
|
|
25
|
+
Note: use single quotes inside XPath for attribute values — @class='price'
|
|
26
|
+
Each column runs its XPath on every page; rows are zipped across columns.
|
|
27
|
+
"""
|
|
28
|
+
import re as _re
|
|
29
|
+
import os as _os
|
|
30
|
+
import csv as _csv
|
|
31
|
+
from collections import deque as _deque
|
|
32
|
+
from itertools import zip_longest as _zip_longest
|
|
33
|
+
from urllib.request import urlopen as _urlopen, Request as _Request
|
|
34
|
+
from urllib.parse import urljoin as _urljoin, urlparse as _urlparse
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
# HTTP
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
def _fetch(url: str, timeout: int = 12) -> bytes | None:
|
|
42
|
+
try:
|
|
43
|
+
req = _Request(url, headers={"User-Agent": "Mozilla/5.0"})
|
|
44
|
+
with _urlopen(req, timeout=timeout) as r:
|
|
45
|
+
ct = r.headers.get("Content-Type", "")
|
|
46
|
+
if "html" not in ct and "xml" not in ct and "text" not in ct:
|
|
47
|
+
return None
|
|
48
|
+
return r.read()
|
|
49
|
+
except Exception as e:
|
|
50
|
+
print(f"[webcrawl] skip {url}: {e}")
|
|
51
|
+
return None
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _extract_links(tree, base_url: str, domain: str) -> list[str]:
    """Collect same-domain http(s) links from every ``<a href>`` in *tree*.

    Each href is resolved against *base_url* and its ``#fragment`` is dropped.
    Links are kept only when the scheme is http/https and the netloc equals
    *domain* exactly. Duplicates are kept, in document order.

    Hrefs that ``urljoin``/``urlparse`` reject with ``ValueError`` (for
    example an unbalanced IPv6 bracket) are skipped, so a single malformed
    link on a page cannot abort the whole crawl.
    """
    links = []
    for href in tree.xpath("//a/@href"):
        try:
            abs_url = _urljoin(base_url, href)
            parsed = _urlparse(abs_url)
        except ValueError:
            # Malformed URL in the page markup: ignore this link only.
            continue
        if parsed.scheme in ("http", "https") and parsed.netloc == domain:
            links.append(abs_url.split("#")[0])
    return links
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# HTML → plain text
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
def _to_text(tree) -> str:
    """Return the visible text of *tree* as newline-separated, non-empty lines.

    ``<script>``, ``<style>``, ``<nav>``, ``<header>``, ``<footer>`` and
    ``<aside>`` elements are removed from *tree* first. This mutates the
    caller's tree. Text is then taken from ``<body>`` when present, otherwise
    from the root; every line is stripped and blank lines are dropped.
    """
    for tag in tree.xpath("//script|//style|//nav|//header|//footer|//aside"):
        parent = tag.getparent()
        if parent is None:
            continue
        # In lxml the text that follows an element lives in its ``tail`` and
        # is discarded by remove(); re-attach it so page text is not lost.
        if tag.tail:
            previous = tag.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or "") + tag.tail
            else:
                parent.text = (parent.text or "") + tag.tail
        parent.remove(tag)

    body = tree.find(".//body")
    root = body if body is not None else tree
    if hasattr(root, "text_content"):
        raw = root.text_content()
    else:
        # Plain lxml.etree elements have no text_content() (it is an
        # lxml.html method); XPath string() gives the same concatenated text.
        raw = root.xpath("string()")

    stripped = (line.strip() for line in raw.splitlines())
    return "\n".join(line for line in stripped if line)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
# URL → local file path
|
|
82
|
+
# ---------------------------------------------------------------------------
|
|
83
|
+
|
|
84
|
+
def _url_to_relpath(url: str, ext: str) -> str:
    """Map the path of *url* to a relative file path ending in *ext*.

    - An empty path (site root) becomes ``index<ext>``.
    - A path ending in ``/`` is treated as a directory and becomes
      ``<dir>/index<ext>``.
    - Otherwise the last segment's extension is replaced by *ext*.

    Empty, ``.`` and ``..`` segments are dropped so the result can never
    climb out of the output directory it is later joined to. The query string
    and fragment are ignored.
    """
    raw_path = _urlparse(url).path
    # Must be checked before the segments are split: stripping slashes first
    # would make the directory case undetectable.
    is_directory = raw_path.endswith("/")
    segments = [seg for seg in raw_path.split("/") if seg not in ("", ".", "..")]
    if not segments:
        return "index" + ext
    if is_directory:
        return "/".join(segments + ["index"]) + ext
    root, _ = _os.path.splitext("/".join(segments))
    return root + ext
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
# Argument parsing helpers
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
|
|
98
|
+
def _parse_inline_columns(args: str) -> dict[str, str]:
    """Collect ``name:"xpath"`` pairs from *args* into a name → XPath dict.

    The XPath is everything between the double quotes, so attribute values
    inside it must use single quotes. A later pair with the same name
    overrides an earlier one.
    """
    columns: dict[str, str] = {}
    for name, xpath in _re.findall(r'(\w+):"([^"]*)"', args):
        columns[name] = xpath
    return columns
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _load_yaml_columns(path: str) -> dict[str, str]:
    """Load a column-name → XPath mapping from the YAML file at *path*.

    Uses ``yaml.safe_load`` when PyYAML is importable; otherwise falls back to
    a naive line parser that accepts ``key: value`` / ``key: "value"`` lines.

    Always returns a ``dict[str, str]``. An empty file yields ``{}``. Entries
    without a value are dropped. A document that is not a mapping prints a
    ``[webcrawl]`` message and yields ``{}``. File and YAML parse errors
    propagate to the caller.
    """
    try:
        import yaml as _yaml
    except ImportError:
        _yaml = None

    with open(path, encoding="utf-8") as f:
        if _yaml is not None:
            data = _yaml.safe_load(f)
        else:
            # fallback: naive key: "value" parser
            data = {}
            for line in f:
                m = _re.match(r'\s*(\w+)\s*:\s*["\']?(.+?)["\']?\s*$', line)
                if m:
                    data[m.group(1)] = m.group(2).strip('"\'')

    if data is None:
        # safe_load returns None for an empty document.
        return {}
    if not isinstance(data, dict):
        print(f"[webcrawl] {path}: expected a mapping of column name to XPath, "
              f"got {type(data).__name__}")
        return {}
    return {str(name): str(xpath) for name, xpath in data.items() if xpath is not None}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _strip_flags(args: str) -> str:
    """Return *args* with inline columns and every known ``--flag`` removed.

    What is left, with surrounding whitespace stripped, is treated by the
    caller as the start URL. Patterns are removed one after another, in the
    order listed.
    """
    removable = (
        r'\w+:"[^"]*"',
        r"--columns\s+\S+",
        r"--mode\s+\S+",
        r"--depth\s+\d+",
        r"--out\s+\S+",
        r"--filter\s+\S+",
        r"--ask",
    )
    remainder = args
    for pattern in removable:
        remainder = _re.sub(pattern, "", remainder)
    return remainder.strip()
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# ---------------------------------------------------------------------------
|
|
128
|
+
# BFS crawler
|
|
129
|
+
# ---------------------------------------------------------------------------
|
|
130
|
+
|
|
131
|
+
def _normalize_filter(f: str) -> str:
    """Return *f* as a URL prefix with a scheme and no trailing slash.

    ``https://`` is prepended unless *f* already starts with ``http://`` or
    ``https://`` (case-insensitive). Testing for the full scheme, not just the
    letters "http", keeps hosts such as ``httpbin.org`` from being mistaken
    for an already-qualified URL.
    """
    f = f.strip()
    if not f.lower().startswith(("http://", "https://")):
        f = "https://" + f
    return f.rstrip("/")
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _crawl(start_url: str, max_depth: int, url_filter: str | None = None):
    """Yield (url, depth, raw_bytes, lxml_tree) for each reachable page.

    Breadth-first crawl from *start_url*, following same-domain links up to
    *max_depth* hops. When *url_filter* is given, only URLs starting with its
    normalized form are fetched, and that includes the start URL. Pages that
    cannot be fetched or parsed are skipped with a printed message.

    Raises:
        ImportError: if lxml is not installed.
    """
    try:
        from lxml import etree as _et
    except ImportError:
        raise ImportError("lxml not installed — run: pip install lxml")

    prefix = _normalize_filter(url_filter) if url_filter else None
    domain = _urlparse(start_url).netloc
    queue = _deque([(start_url, 0)])
    visited: set[str] = set()

    while queue:
        url, depth = queue.popleft()
        if url in visited:
            continue
        if prefix and not url.startswith(prefix):
            continue
        visited.add(url)

        print(f"[webcrawl] ({depth}/{max_depth}) {url}")
        raw = _fetch(url)
        if raw is None:
            continue

        try:
            tree = _et.fromstring(raw, _et.HTMLParser())
        except Exception as e:
            print(f"[webcrawl] parse error {url}: {e}")
            continue
        if tree is None:
            # Defensive: no root element was produced, nothing to yield.
            continue

        # Gather outgoing links BEFORE handing the tree to the consumer:
        # consumers such as _to_text() strip <nav>/<header>/<footer> from the
        # tree in place, which would otherwise hide those links from the crawl.
        next_links = _extract_links(tree, url, domain) if depth < max_depth else []

        yield url, depth, raw, tree

        for link in next_links:
            if link not in visited:
                queue.append((link, depth + 1))
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# ---------------------------------------------------------------------------
|
|
178
|
+
# Modes
|
|
179
|
+
# ---------------------------------------------------------------------------
|
|
180
|
+
|
|
181
|
+
def _xpath_values(tree, xp: str) -> list[str]:
    """Evaluate *xp* on *tree* and return its non-empty results as stripped strings."""
    found = tree.xpath(xp)
    if not isinstance(found, list):
        # Scalar XPath results (string(...), count(...), boolean(...)) come
        # back as a single str/float/bool; treat them as one value instead of
        # iterating a string character by character.
        found = [found]
    values = []
    for node in found:
        if hasattr(node, "text_content"):
            text = node.text_content()
        elif hasattr(node, "itertext"):
            # Plain etree element: join its text instead of using its repr.
            text = "".join(node.itertext())
        else:
            text = str(node)
        text = text.strip()
        if text:
            values.append(text)
    return values


def _mode_extract(start_url, max_depth, columns, out_path, url_filter=None):
    """Crawl from *start_url* and write XPath-extracted columns to a CSV file.

    *columns* maps column name → XPath. On each page every XPath is evaluated
    and the per-column value lists are zipped into rows, with shorter columns
    padded with ``""``. Each row is prefixed with the page URL. The CSV,
    with header ``url`` plus the column names, is written to *out_path* once
    the crawl has finished.

    Returns the list of row tuples.
    """
    rows = []
    col_names = list(columns.keys())
    xpaths = list(columns.values())

    for url, _depth, _raw, tree in _crawl(start_url, max_depth, url_filter):
        per_column = [_xpath_values(tree, xp) for xp in xpaths]
        for values in _zip_longest(*per_column, fillvalue=""):
            rows.append((url,) + tuple(values))

    with open(out_path, "w", newline="", encoding="utf-8") as f:
        w = _csv.writer(f)
        w.writerow(["url"] + col_names)
        w.writerows(rows)

    print(f"[webcrawl] extract done — {len(rows)} rows → {out_path}")
    return rows
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _mode_pages(start_url, max_depth, out_dir, url_filter=None):
    """Crawl from *start_url* and save each page's text as a ``.txt`` file.

    Files are placed under *out_dir* at the relative path derived from the
    page URL; each starts with a ``URL: <url>`` line and a blank line,
    followed by the extracted text. Returns the number of files written.
    """
    _os.makedirs(out_dir, exist_ok=True)
    saved = 0
    for page_url, _depth, _raw, tree in _crawl(start_url, max_depth, url_filter):
        target = _os.path.join(out_dir, _url_to_relpath(page_url, ".txt"))
        _os.makedirs(_os.path.dirname(target), exist_ok=True)
        with open(target, "w", encoding="utf-8") as fh:
            fh.write(f"URL: {page_url}\n\n")
            fh.write(_to_text(tree))
        saved += 1
    print(f"[webcrawl] pages done — {saved} files → {out_dir}")
    return saved
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _mode_combine(start_url, max_depth, out_path, url_filter=None):
    """Crawl from *start_url* and write the text of all pages into one file.

    Every page becomes a ``=== <url> ===`` header followed by its text. Pages
    are separated by a blank line, a 60-character rule and another blank
    line. Returns the list of per-page text parts.
    """
    parts = []
    for url, _depth, _raw, tree in _crawl(start_url, max_depth, url_filter):
        parts.append(f"=== {url} ===\n\n{_to_text(tree)}")

    # Build the full separator first: joining with only "\n\n" and prefixing
    # the rule once would put the rule at the top of the file and nowhere else.
    separator = "\n\n" + ("─" * 60) + "\n\n"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(separator.join(parts))

    print(f"[webcrawl] combine done — {len(parts)} pages → {out_path}")
    return parts
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _mode_mirror(start_url, max_depth, out_dir, url_filter=None):
    """Crawl from *start_url* and save pages as ``.html`` with relative links.

    Works in two passes: first the whole crawl is collected so the set of
    saved pages is known; then every ``<a href>`` that points at a saved page
    is rewritten to a path relative to the linking file, keeping any
    ``#fragment``. Links to pages that were not crawled are left untouched.
    ``<base>`` elements are removed. Returns the number of pages saved.
    """
    from lxml import etree as _et

    _os.makedirs(out_dir, exist_ok=True)

    # pass 1 — crawl, collect pages, build url→local_file index
    pages: list[tuple[str, object]] = []
    url_to_file: dict[str, str] = {}
    for url, _depth, _raw, tree in _crawl(start_url, max_depth, url_filter):
        url_to_file[url] = _os.path.join(out_dir, _url_to_relpath(url, ".html"))
        # Raw bytes are not needed for pass 2, so they are not retained.
        pages.append((url, tree))

    # pass 2 — rewrite links, save
    for url, tree in pages:
        dest = url_to_file[url]
        from_dir = _os.path.dirname(dest)

        # remove <base href> — it overrides all relative links and points back to original site
        for base in tree.xpath("//base"):
            p = base.getparent()
            if p is not None:
                p.remove(base)

        for a in tree.xpath("//a[@href]"):
            href = a.get("href", "")
            try:
                target, _, fragment = _urljoin(url, href).partition("#")
            except ValueError:
                # Malformed href: leave this link as it is.
                continue
            local_file = url_to_file.get(target)
            if local_file is None:
                continue
            rel_path = _os.path.relpath(local_file, from_dir).replace("\\", "/")
            # Keep in-page anchors working in the mirrored copy.
            a.set("href", f"{rel_path}#{fragment}" if fragment else rel_path)

        _os.makedirs(from_dir, exist_ok=True)
        html_bytes = _et.tostring(tree, method="html", encoding="unicode").encode("utf-8")
        with open(dest, "wb") as f:
            f.write(html_bytes)

    print(f"[webcrawl] mirror done — {len(pages)} html files → {out_dir}")
    return len(pages)
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# ---------------------------------------------------------------------------
|
|
278
|
+
# Entry point
|
|
279
|
+
# ---------------------------------------------------------------------------
|
|
280
|
+
|
|
281
|
+
def run(chat, args: str):
    """Entry point of the webcrawl flow: parse *args*, crawl, optionally summarise.

    Recognised options are ``--mode`` (default ``extract``), ``--depth``
    (default 2), ``--out``, ``--columns``, ``--filter`` and ``--ask``. What
    remains of *args* after removing them is the start URL. It must start
    with "http", otherwise the usage text is printed and nothing is crawled.

    With ``--ask``, a prompt describing the result is streamed through *chat*
    and, when a reply comes back, the exchange is recorded on *chat*.
    """
    def _option(name, value=r"\S+"):
        # Value of "--<name> <value>" in args, or None when the flag is absent.
        found = _re.search(rf"--{name}\s+({value})", args)
        return found.group(1) if found else None

    depth_arg = _option("depth", r"\d+")
    mode = _option("mode") or "extract"
    max_depth = 2 if depth_arg is None else int(depth_arg)
    out_arg = _option("out")
    url_filter = _option("filter")
    want_summary = "--ask" in args

    if url_filter:
        print(f"[webcrawl] filter: only pages under {_normalize_filter(url_filter)}")

    start_url = _strip_flags(args)
    if not start_url.startswith("http"):
        usage_lines = (
            "usage:",
            ' /flow webcrawl <url> name:"//h1/text()" price:"//span/text()" [--out result.csv]',
            " /flow webcrawl <url> --columns cols.yaml [--out result.csv]",
            " /flow webcrawl <url> --mode pages --out ./pages/",
            " /flow webcrawl <url> --mode combine --out all.txt",
            " /flow webcrawl <url> --mode mirror --out ./mirror/",
        )
        print("\n".join(usage_lines))
        return

    try:
        from lxml import etree  # noqa: F401
    except ImportError:
        print("[webcrawl] lxml not installed — run: pip install lxml")
        return

    # ---- dispatch ----

    if mode == "extract":
        columns_file = _option("columns")
        if columns_file:
            columns = _load_yaml_columns(columns_file)
        else:
            columns = _parse_inline_columns(args)
        if not columns:
            print("[webcrawl] extract mode requires columns — inline or --columns file.yaml")
            return
        out_path = out_arg or "crawl_result.csv"
        result = _mode_extract(start_url, max_depth, columns, out_path, url_filter)
        summary_hint = f"{len(result)} rows extracted to {out_path}"
        sample = "\n".join(",".join(str(v) for v in r) for r in result[:40])

    elif mode == "pages":
        out_dir = out_arg or "crawl_pages"
        count = _mode_pages(start_url, max_depth, out_dir, url_filter)
        summary_hint = f"{count} pages saved to {out_dir}"
        sample = f"Directory: {out_dir}"

    elif mode == "combine":
        out_path = out_arg or "crawl_combined.txt"
        parts = _mode_combine(start_url, max_depth, out_path, url_filter)
        summary_hint = f"{len(parts)} pages combined to {out_path}"
        sample = "\n\n---\n\n".join(p[:500] for p in parts[:3])

    elif mode == "mirror":
        out_dir = out_arg or "crawl_mirror"
        count = _mode_mirror(start_url, max_depth, out_dir, url_filter)
        summary_hint = f"{count} html files mirrored to {out_dir}"
        sample = f"Directory: {out_dir}"

    else:
        print(f"[webcrawl] unknown mode '{mode}' — use: extract | pages | combine | mirror")
        return

    if not want_summary:
        return

    prompt = (
        f"I crawled {start_url} in '{mode}' mode (depth={max_depth}).\n"
        f"Result: {summary_hint}\n\n"
        f"Sample output:\n{sample}\n\n"
        "Summarize what was found and note anything interesting or unexpected."
    )
    temp_msgs = [
        {"role": "system", "content": chat._role},
        {"role": "user", "content": prompt},
    ]
    chat._sep("AI")
    reply = chat._stream_chat(temp_msgs)
    if not reply:
        return
    chat.last_reply = reply
    chat._last_output = reply
    chat.messages.append({"role": "user", "content": f"[webcrawl: {start_url} mode={mode}]"})
    chat.messages.append({"role": "assistant", "content": reply})
|
|
@@ -44,6 +44,16 @@ import argparse
|
|
|
44
44
|
DEFAULT_MAP = os.path.join('.1bcoder', 'map.txt')
|
|
45
45
|
|
|
46
46
|
|
|
47
|
+
def _truncate(text: str, max_lines: int) -> str:
    """Cap *text* at *max_lines* lines, appending a note about what was cut.

    *text* is returned unchanged when *max_lines* is zero or negative, or when
    it has no more than *max_lines* lines. Otherwise the first *max_lines*
    lines are kept and a ``[truncated: N lines hidden ...]`` notice follows
    after a blank line.
    """
    if max_lines > 0:
        rows = text.splitlines()
        overflow = len(rows) - max_lines
        if overflow > 0:
            kept = '\n'.join(rows[:max_lines])
            return f'{kept}\n\n[truncated: {overflow} lines hidden — use --max-lines to see more]'
    return text
|
|
55
|
+
|
|
56
|
+
|
|
47
57
|
# ── parse ───────────────────────────────────────────────────────────────────────
|
|
48
58
|
|
|
49
59
|
def parse_map(map_path: str) -> tuple:
|
|
@@ -145,12 +155,13 @@ def compute_cohesion(links_map: dict, k: int = 5) -> tuple:
|
|
|
145
155
|
|
|
146
156
|
# ── find ────────────────────────────────────────────────────────────────────────
|
|
147
157
|
|
|
148
|
-
def find_map(map_path: str, query: str) -> tuple:
|
|
158
|
+
def find_map(map_path: str, query: str, max_lines: int = 0) -> tuple:
|
|
149
159
|
"""Search map.txt with filter syntax.
|
|
150
160
|
|
|
151
161
|
Returns (hits, rendered_string).
|
|
152
162
|
hits — list of matching block strings (empty list means full map returned).
|
|
153
163
|
rendered_string — the text to display / inject.
|
|
164
|
+
max_lines — hard cap on output lines (0 = unlimited).
|
|
154
165
|
"""
|
|
155
166
|
with open(map_path, encoding='utf-8') as f:
|
|
156
167
|
content = f.read()
|
|
@@ -220,7 +231,7 @@ def find_map(map_path: str, query: str) -> tuple:
|
|
|
220
231
|
if not b.startswith('#')
|
|
221
232
|
for r in [process_block(b)] if r is not None]
|
|
222
233
|
|
|
223
|
-
return hits, '\n'.join(hits)
|
|
234
|
+
return hits, _truncate('\n'.join(hits), max_lines)
|
|
224
235
|
|
|
225
236
|
|
|
226
237
|
# ── trace ───────────────────────────────────────────────────────────────────────
|
|
@@ -301,10 +312,11 @@ def trace_deps(map_path: str, identifier: str, max_depth: int = 8, leaves_only:
|
|
|
301
312
|
return '\n'.join(lines_out)
|
|
302
313
|
|
|
303
314
|
|
|
304
|
-
def trace_map(map_path: str, identifier: str, max_depth: int = 8) -> str:
|
|
315
|
+
def trace_map(map_path: str, identifier: str, max_depth: int = 8, max_lines: int = 0) -> str:
|
|
305
316
|
"""BFS backwards through the call graph from a defined identifier.
|
|
306
317
|
|
|
307
318
|
Returns a rendered string, or None if the identifier is not found in defines.
|
|
319
|
+
max_lines — hard cap on output lines (0 = unlimited).
|
|
308
320
|
"""
|
|
309
321
|
defines_map, links_map = parse_map(map_path)
|
|
310
322
|
|
|
@@ -347,7 +359,7 @@ def trace_map(map_path: str, identifier: str, max_depth: int = 8) -> str:
|
|
|
347
359
|
visited.add(caller)
|
|
348
360
|
queue.append((caller, depth + 1))
|
|
349
361
|
|
|
350
|
-
return '\n'.join(lines_out)
|
|
362
|
+
return _truncate('\n'.join(lines_out), max_lines)
|
|
351
363
|
|
|
352
364
|
|
|
353
365
|
def _resolve_id(token: str, defines_map: dict):
|
|
@@ -628,9 +640,13 @@ def main():
|
|
|
628
640
|
p_find = sub.add_parser('find', help='Filter map blocks by filename/content')
|
|
629
641
|
p_find.add_argument('query', nargs='*',
|
|
630
642
|
help='Filter tokens (term, !term, \\term, \\!term, \\\\!term)')
|
|
643
|
+
p_find.add_argument('--max-lines', type=int, default=200, metavar='N',
|
|
644
|
+
help='Hard cap on output lines (default: 200, 0 = unlimited)')
|
|
631
645
|
|
|
632
646
|
p_trace = sub.add_parser('trace', help='Follow call chain backwards from an identifier')
|
|
633
647
|
p_trace.add_argument('identifier', help='Identifier name to trace')
|
|
648
|
+
p_trace.add_argument('--max-lines', type=int, default=200, metavar='N',
|
|
649
|
+
help='Hard cap on output lines (default: 200, 0 = unlimited)')
|
|
634
650
|
|
|
635
651
|
p_idiff = sub.add_parser('idiff', help='ORPHAN_DRIFT + GHOST ALERT between two map snapshots')
|
|
636
652
|
p_idiff.add_argument('--prev', required=True, metavar='FILE',
|
|
@@ -645,7 +661,7 @@ def main():
|
|
|
645
661
|
|
|
646
662
|
if args.cmd == 'find':
|
|
647
663
|
query = ' '.join(args.query)
|
|
648
|
-
hits, result = find_map(args.map, query)
|
|
664
|
+
hits, result = find_map(args.map, query, max_lines=args.max_lines)
|
|
649
665
|
if not query:
|
|
650
666
|
print(result)
|
|
651
667
|
elif hits:
|
|
@@ -656,7 +672,7 @@ def main():
|
|
|
656
672
|
sys.exit(1)
|
|
657
673
|
|
|
658
674
|
elif args.cmd == 'trace':
|
|
659
|
-
result = trace_map(args.map, args.identifier)
|
|
675
|
+
result = trace_map(args.map, args.identifier, max_lines=args.max_lines)
|
|
660
676
|
if result is None:
|
|
661
677
|
print(f"[map] '{args.identifier}' not found in any defines", file=sys.stderr)
|
|
662
678
|
print(f"hint: try: python map_query.py find \\{args.identifier}", file=sys.stderr)
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "1bcoder"
|
|
7
|
-
version = "0.1.12"
|
|
7
|
+
version = "0.1.13"
|
|
8
8
|
description = "AI coding assistant agent for 1B–7B local models (Ollama, LMStudio, llama.cpp). Terminal REPL with file editing, project map, agents, scripts, and parallel multi-model queries."
|
|
9
9
|
requires-python = ">=3.10"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/flows/__pycache__/commit_message.cpython-311.pyc
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/personal/content/create-regular-content.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{1bcoder-0.1.12 → 1bcoder-0.1.13}/_bcoder_data/scripts/remote/create-content-on-remote-server.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|