omniscout 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. omniscout-0.1.0/.python-version +1 -0
  2. omniscout-0.1.0/CHANGELOG.md +19 -0
  3. omniscout-0.1.0/PKG-INFO +197 -0
  4. omniscout-0.1.0/README.md +156 -0
  5. omniscout-0.1.0/RELEASING.md +73 -0
  6. omniscout-0.1.0/harness/__init__.py +3 -0
  7. omniscout-0.1.0/harness/__main__.py +6 -0
  8. omniscout-0.1.0/harness/app.py +306 -0
  9. omniscout-0.1.0/harness/client.py +94 -0
  10. omniscout-0.1.0/harness/commands/__init__.py +0 -0
  11. omniscout-0.1.0/harness/commands/benchmark.py +186 -0
  12. omniscout-0.1.0/harness/commands/browser.py +597 -0
  13. omniscout-0.1.0/harness/commands/daemon.py +349 -0
  14. omniscout-0.1.0/harness/commands/extract.py +37 -0
  15. omniscout-0.1.0/harness/commands/profile.py +47 -0
  16. omniscout-0.1.0/harness/commands/research.py +44 -0
  17. omniscout-0.1.0/harness/commands/search.py +154 -0
  18. omniscout-0.1.0/harness/commands/session.py +75 -0
  19. omniscout-0.1.0/harness/commands/workflow.py +100 -0
  20. omniscout-0.1.0/harness/config.py +122 -0
  21. omniscout-0.1.0/harness/daemon/__init__.py +10 -0
  22. omniscout-0.1.0/harness/daemon/__main__.py +17 -0
  23. omniscout-0.1.0/harness/daemon/backends/__init__.py +215 -0
  24. omniscout-0.1.0/harness/daemon/backends/extension.py +457 -0
  25. omniscout-0.1.0/harness/daemon/backends/playwright.py +1244 -0
  26. omniscout-0.1.0/harness/daemon/captcha.py +203 -0
  27. omniscout-0.1.0/harness/daemon/events.py +111 -0
  28. omniscout-0.1.0/harness/daemon/history.py +244 -0
  29. omniscout-0.1.0/harness/daemon/lifecycle.py +252 -0
  30. omniscout-0.1.0/harness/daemon/protocol.py +90 -0
  31. omniscout-0.1.0/harness/daemon/server.py +591 -0
  32. omniscout-0.1.0/harness/daemon/sessions_store.py +127 -0
  33. omniscout-0.1.0/harness/daemon/snapshot.py +224 -0
  34. omniscout-0.1.0/harness/engines/__init__.py +0 -0
  35. omniscout-0.1.0/harness/engines/browser.py +397 -0
  36. omniscout-0.1.0/harness/engines/crawler.py +276 -0
  37. omniscout-0.1.0/harness/engines/extractor.py +172 -0
  38. omniscout-0.1.0/harness/engines/research.py +199 -0
  39. omniscout-0.1.0/harness/engines/search/__init__.py +0 -0
  40. omniscout-0.1.0/harness/engines/search/answer.py +102 -0
  41. omniscout-0.1.0/harness/engines/search/answer_balanced.py +200 -0
  42. omniscout-0.1.0/harness/engines/search/answer_cache.py +116 -0
  43. omniscout-0.1.0/harness/engines/search/answer_deep.py +101 -0
  44. omniscout-0.1.0/harness/engines/search/answer_eval.py +69 -0
  45. omniscout-0.1.0/harness/engines/search/answer_fast.py +95 -0
  46. omniscout-0.1.0/harness/engines/search/answer_text.py +366 -0
  47. omniscout-0.1.0/harness/engines/search/ddg.py +128 -0
  48. omniscout-0.1.0/harness/engines/search/embed.py +125 -0
  49. omniscout-0.1.0/harness/engines/search/index.py +152 -0
  50. omniscout-0.1.0/harness/engines/search/pipeline.py +106 -0
  51. omniscout-0.1.0/harness/engines/search/query_classifier.py +59 -0
  52. omniscout-0.1.0/harness/engines/search/rerank.py +38 -0
  53. omniscout-0.1.0/harness/io.py +108 -0
  54. omniscout-0.1.0/harness/logging.py +99 -0
  55. omniscout-0.1.0/harness/models.py +257 -0
  56. omniscout-0.1.0/harness/store/__init__.py +0 -0
  57. omniscout-0.1.0/harness/store/cache.py +112 -0
  58. omniscout-0.1.0/harness/store/sessions.py +155 -0
  59. omniscout-0.1.0/harness/store/workflow.py +135 -0
  60. omniscout-0.1.0/pyproject.toml +79 -0
@@ -0,0 +1 @@
1
+ 3.11
@@ -0,0 +1,19 @@
1
+ # Changelog
2
+
3
+ All notable changes to the `omniscout` Python package are documented in this
4
+ file.
5
+
6
+ ## [0.1.0] - 2026-05-28
7
+
8
+ ### Added
9
+
10
+ - Initial public release scaffolding for PyPI.
11
+ - GitHub Actions CI for Linux/macOS tests, build checks, and install smoke test.
12
+ - TestPyPI and PyPI trusted publishing workflows via GitHub OIDC.
13
+ - Release process documentation in `RELEASING.md`.
14
+
15
+ ### Changed
16
+
17
+ - Renamed distribution package from `harness-harness` to `omniscout`.
18
+ - Set `omniscout` as the primary console entrypoint and kept `harness` alias.
19
+ - Added project URL metadata for PyPI.
@@ -0,0 +1,197 @@
1
+ Metadata-Version: 2.4
2
+ Name: omniscout
3
+ Version: 0.1.0
4
+ Summary: OmniScout CLI (harness): local-first browser automation, semantic search, and research for AI agents
5
+ Project-URL: Homepage, https://github.com/sriramramnath/omniscout
6
+ Project-URL: Repository, https://github.com/sriramramnath/omniscout
7
+ Project-URL: Issues, https://github.com/sriramramnath/omniscout/issues
8
+ Author: OmniScout
9
+ License: MIT
10
+ Keywords: agent,cli,playwright,research,scraping,search
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: aiohttp>=3.9
21
+ Requires-Dist: httpx>=0.27
22
+ Requires-Dist: markdownify>=0.13
23
+ Requires-Dist: nltk>=3.8
24
+ Requires-Dist: platformdirs>=4.2
25
+ Requires-Dist: playwright>=1.45
26
+ Requires-Dist: pydantic>=2.7
27
+ Requires-Dist: qdrant-client>=1.9
28
+ Requires-Dist: rich>=13.7
29
+ Requires-Dist: selectolax>=0.3.21
30
+ Requires-Dist: sentence-transformers>=2.7
31
+ Requires-Dist: sumy>=0.11
32
+ Requires-Dist: tomli>=2.0; python_version < '3.11'
33
+ Requires-Dist: trafilatura>=1.12
34
+ Requires-Dist: typer>=0.12
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
37
+ Requires-Dist: pytest>=8; extra == 'dev'
38
+ Requires-Dist: respx>=0.21; extra == 'dev'
39
+ Requires-Dist: ruff>=0.5; extra == 'dev'
40
+ Description-Content-Type: text/markdown
41
+
42
+ # scout — OmniScout CLI
43
+
44
+ Local-first browser automation, semantic search, and research for AI agents.
45
+ No cloud APIs, no hosted browser sessions, no MCP, no SDK.
46
+
47
+ The CLI is the interface.
48
+
49
+ ## Install
50
+
51
+ Requires Python 3.11+ and Google Chrome (on macOS it is typically installed
52
+ at `/Applications/Google Chrome.app`).
53
+
54
+ ### Recommended: install as a global tool
55
+
56
+ ```bash
57
+ cd cli
58
+ uv tool install --editable . # creates ~/.local/bin/omniscout on PATH
59
+ scout install # verifies Chrome + prefetches embedding model
60
+ ```
61
+
62
+ After this, `scout` works from any directory. Edits to source files are
63
+ picked up live (editable install).
64
+
65
+ `omniscout` and `harness` remain available as compatibility aliases.
66
+
67
+ ### Alternative: project venv
68
+
69
+ If you prefer not to install globally:
70
+
71
+ ```bash
72
+ cd cli
73
+ uv venv --python 3.11 .venv
74
+ uv pip install -e ".[dev]"
75
+ source .venv/bin/activate
76
+ scout install
77
+ ```
78
+
79
+ If you don't have Chrome installed, run `scout install --bundled` to also
80
+ download Playwright's bundled Chromium (~190MB).
81
+
82
+ `scout install` also prefetches the local sentence-transformers model into
83
+ OmniScout's app data directory so later commands do not need to fetch it again.
84
+ Use `--no-model` to skip model prefetch.
85
+
86
+ ## Quickstart
87
+
88
+ ```bash
89
+ # Search the web (DuckDuckGo HTML + local embedding rerank)
90
+ scout search "local-first browser agents"
91
+ # same command via alias:
92
+ omniscout search "local-first browser agents"
93
+
94
+ # Extract a URL to clean Markdown
95
+ scout extract https://example.com
96
+
97
+ # Capture a screenshot of a real page using your installed Chrome
98
+ scout browser screenshot https://example.com --out page.png
99
+
100
+ # Run a multi-step research pipeline (search -> crawl -> extract -> rerank -> summarize)
101
+ scout research "state of local AI agents in 2026"
102
+
103
+ # Manage persistent browser profiles (cookies, logins persist across runs)
104
+ scout profile create work
105
+ scout browser open https://news.ycombinator.com --profile work --headful
106
+
107
+ # Long-lived browser sessions (other tools can attach via CDP)
108
+ scout session start --headful
109
+ scout session list
110
+ scout session kill --all
111
+ ```
112
+
113
+ ## JSON output (for agents)
114
+
115
+ Every command emits structured JSON when invoked with `--json` (or with
116
+ `HARNESS_JSON=1` in the environment). Logs always go to stderr; stdout is
117
+ reserved for the structured result.
118
+
119
+ ```bash
120
+ HARNESS_JSON=1 scout search "robotics simulators" --limit 5
121
+ ```
122
+
123
+ ## Architecture
124
+
125
+ ```
126
+ harness/
127
+ app.py # Typer root
128
+ commands/ # CLI sub-commands (thin)
129
+ engines/
130
+ browser.py # Playwright + system Chrome
131
+ extractor.py # trafilatura + markdownify
132
+ crawler.py # async httpx + Chrome fallback
133
+ search/
134
+ ddg.py # DuckDuckGo HTML
135
+ embed.py # sentence-transformers (all-MiniLM-L6-v2)
136
+ index.py # embedded Qdrant on-disk
137
+ rerank.py # cosine rerank
138
+ pipeline.py # ddg | index | hybrid
139
+ research.py # full pipeline (search -> crawl -> extract -> rerank -> summarize)
140
+ store/
141
+ cache.py # SQLite + content-hashed HTML cache
142
+ sessions.py # SQLite registry of browser sessions
143
+ models.py # pydantic result types (the JSON contract)
144
+ ```
145
+
146
+ On-disk state lives under `~/Library/Application Support/harness/` (macOS) /
147
+ `$XDG_DATA_HOME/harness/` (Linux):
148
+
149
+ | Path | Purpose |
150
+ | ---------------- | -------------------------------------------------- |
151
+ | `profiles/` | Persistent Chrome user-data-dirs |
152
+ | `qdrant/` | Embedded vector index |
153
+ | `sessions.sqlite`| Registry of long-lived browser sessions |
154
+ | `cache/pages/` | Content-hashed HTML cache used by extract+crawler |
155
+
156
+ Override via `HARNESS_DATA_DIR`, `HARNESS_CONFIG_DIR`, `HARNESS_CACHE_DIR`,
157
+ or settings in `~/Library/Application Support/harness/config.toml`.
158
+
159
+ ## Configuration
160
+
161
+ `config.toml` example:
162
+
163
+ ```toml
164
+ default_source = "ddg" # search source default
165
+ search_limit = 10
166
+ research_results = 8
167
+ request_throttle_seconds = 1.0 # per-host throttle in the crawler
168
+ embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
169
+ embedding_local_only = true # default; never fetch model files at query time
170
+ browser_channel = "chrome" # uses installed Google Chrome
171
+ # browser_executable = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
172
+ summary_sentences = 6
173
+ ```
174
+
175
+ Set `HARNESS_EMBED_LOCAL_ONLY=0` to allow runtime Hugging Face fetches.
176
+
177
+ ## Testing
178
+
179
+ ```bash
180
+ .venv/bin/pytest
181
+ ```
182
+
183
+ Tests run offline — search, extract, and the research pipeline are all
184
+ exercised against saved HTML fixtures and patched network seams.
185
+
186
+ ## Why local Chrome?
187
+
188
+ Using your system Chrome (channel = "chrome") gives you:
189
+
190
+ - Real cookies, login state, extensions, and font rendering
191
+ - No extra ~190MB Chromium download
192
+ - The same user-agent fingerprint as your daily browsing
193
+ - Cleaner integration with `omniscout session start` for long-lived sessions
194
+ that other tools can attach to over CDP
195
+
196
+ If Chrome isn't available, the engine transparently falls back to Playwright's
197
+ bundled Chromium.
@@ -0,0 +1,156 @@
1
+ # scout — OmniScout CLI
2
+
3
+ Local-first browser automation, semantic search, and research for AI agents.
4
+ No cloud APIs, no hosted browser sessions, no MCP, no SDK.
5
+
6
+ The CLI is the interface.
7
+
8
+ ## Install
9
+
10
+ Requires Python 3.11+ and Google Chrome (on macOS it is typically installed
11
+ at `/Applications/Google Chrome.app`).
12
+
13
+ ### Recommended: install as a global tool
14
+
15
+ ```bash
16
+ cd cli
17
+ uv tool install --editable . # creates ~/.local/bin/omniscout on PATH
18
+ scout install # verifies Chrome + prefetches embedding model
19
+ ```
20
+
21
+ After this, `scout` works from any directory. Edits to source files are
22
+ picked up live (editable install).
23
+
24
+ `omniscout` and `harness` remain available as compatibility aliases.
25
+
26
+ ### Alternative: project venv
27
+
28
+ If you prefer not to install globally:
29
+
30
+ ```bash
31
+ cd cli
32
+ uv venv --python 3.11 .venv
33
+ uv pip install -e ".[dev]"
34
+ source .venv/bin/activate
35
+ scout install
36
+ ```
37
+
38
+ If you don't have Chrome installed, run `scout install --bundled` to also
39
+ download Playwright's bundled Chromium (~190MB).
40
+
41
+ `scout install` also prefetches the local sentence-transformers model into
42
+ OmniScout's app data directory so later commands do not need to fetch it again.
43
+ Use `--no-model` to skip model prefetch.
44
+
45
+ ## Quickstart
46
+
47
+ ```bash
48
+ # Search the web (DuckDuckGo HTML + local embedding rerank)
49
+ scout search "local-first browser agents"
50
+ # same command via alias:
51
+ omniscout search "local-first browser agents"
52
+
53
+ # Extract a URL to clean Markdown
54
+ scout extract https://example.com
55
+
56
+ # Capture a screenshot of a real page using your installed Chrome
57
+ scout browser screenshot https://example.com --out page.png
58
+
59
+ # Run a multi-step research pipeline (search -> crawl -> extract -> rerank -> summarize)
60
+ scout research "state of local AI agents in 2026"
61
+
62
+ # Manage persistent browser profiles (cookies, logins persist across runs)
63
+ scout profile create work
64
+ scout browser open https://news.ycombinator.com --profile work --headful
65
+
66
+ # Long-lived browser sessions (other tools can attach via CDP)
67
+ scout session start --headful
68
+ scout session list
69
+ scout session kill --all
70
+ ```
71
+
72
+ ## JSON output (for agents)
73
+
74
+ Every command emits structured JSON when invoked with `--json` (or with
75
+ `HARNESS_JSON=1` in the environment). Logs always go to stderr; stdout is
76
+ reserved for the structured result.
77
+
78
+ ```bash
79
+ HARNESS_JSON=1 scout search "robotics simulators" --limit 5
80
+ ```
81
+
82
+ ## Architecture
83
+
84
+ ```
85
+ harness/
86
+ app.py # Typer root
87
+ commands/ # CLI sub-commands (thin)
88
+ engines/
89
+ browser.py # Playwright + system Chrome
90
+ extractor.py # trafilatura + markdownify
91
+ crawler.py # async httpx + Chrome fallback
92
+ search/
93
+ ddg.py # DuckDuckGo HTML
94
+ embed.py # sentence-transformers (all-MiniLM-L6-v2)
95
+ index.py # embedded Qdrant on-disk
96
+ rerank.py # cosine rerank
97
+ pipeline.py # ddg | index | hybrid
98
+ research.py # full pipeline (search -> crawl -> extract -> rerank -> summarize)
99
+ store/
100
+ cache.py # SQLite + content-hashed HTML cache
101
+ sessions.py # SQLite registry of browser sessions
102
+ models.py # pydantic result types (the JSON contract)
103
+ ```
104
+
105
+ On-disk state lives under `~/Library/Application Support/harness/` (macOS) /
106
+ `$XDG_DATA_HOME/harness/` (Linux):
107
+
108
+ | Path | Purpose |
109
+ | ---------------- | -------------------------------------------------- |
110
+ | `profiles/` | Persistent Chrome user-data-dirs |
111
+ | `qdrant/` | Embedded vector index |
112
+ | `sessions.sqlite`| Registry of long-lived browser sessions |
113
+ | `cache/pages/` | Content-hashed HTML cache used by extract+crawler |
114
+
115
+ Override via `HARNESS_DATA_DIR`, `HARNESS_CONFIG_DIR`, `HARNESS_CACHE_DIR`,
116
+ or settings in `~/Library/Application Support/harness/config.toml`.
117
+
118
+ ## Configuration
119
+
120
+ `config.toml` example:
121
+
122
+ ```toml
123
+ default_source = "ddg" # search source default
124
+ search_limit = 10
125
+ research_results = 8
126
+ request_throttle_seconds = 1.0 # per-host throttle in the crawler
127
+ embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
128
+ embedding_local_only = true # default; never fetch model files at query time
129
+ browser_channel = "chrome" # uses installed Google Chrome
130
+ # browser_executable = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
131
+ summary_sentences = 6
132
+ ```
133
+
134
+ Set `HARNESS_EMBED_LOCAL_ONLY=0` to allow runtime Hugging Face fetches.
135
+
136
+ ## Testing
137
+
138
+ ```bash
139
+ .venv/bin/pytest
140
+ ```
141
+
142
+ Tests run offline — search, extract, and the research pipeline are all
143
+ exercised against saved HTML fixtures and patched network seams.
144
+
145
+ ## Why local Chrome?
146
+
147
+ Using your system Chrome (channel = "chrome") gives you:
148
+
149
+ - Real cookies, login state, extensions, and font rendering
150
+ - No extra ~190MB Chromium download
151
+ - The same user-agent fingerprint as your daily browsing
152
+ - Cleaner integration with `omniscout session start` for long-lived sessions
153
+ that other tools can attach to over CDP
154
+
155
+ If Chrome isn't available, the engine transparently falls back to Playwright's
156
+ bundled Chromium.
@@ -0,0 +1,73 @@
1
+ # Releasing `omniscout` to PyPI
2
+
3
+ This project publishes the Python CLI package from `cli/` using GitHub Actions
4
+ OIDC trusted publishing.
5
+
6
+ ## Package identity
7
+
8
+ - Distribution name: `omniscout`
9
+ - Console commands:
10
+ - `omniscout` (primary)
11
+ - `scout` (alias)
12
+ - `harness` (compatibility alias)
13
+
14
+ ## Trusted publisher configuration
15
+
16
+ Configure on **pypi.org** (production):
17
+
18
+ - PyPI project name: `omniscout`
19
+ - Owner: `sriramramnath`
20
+ - Repository: `omniscout`
21
+ - Workflow: `pypi-publish.yml`
22
+ - Environment: `pypi`
23
+
24
+ If the workflow file name or environment name changes, update the trusted
25
+ publisher entry to match exactly before publishing.
26
+
27
+ TestPyPI is optional and not wired in CI right now. Add a `testpypi-publish.yml`
28
+ workflow and register a matching publisher on test.pypi.org when you want a
29
+ staging index later.
30
+
31
+ ## Versioning strategy
32
+
33
+ Use semantic versioning:
34
+
35
+ - `MAJOR`: breaking CLI/JSON contract changes.
36
+ - `MINOR`: backward-compatible features.
37
+ - `PATCH`: backward-compatible fixes/docs/internal improvements.
38
+
39
+ Release flow:
40
+
41
+ 1. Update version in `pyproject.toml` and `harness/__init__.py`.
42
+ 2. Add a new section in `CHANGELOG.md`.
43
+ 3. Open PR and ensure CI passes.
44
+ 4. Create and push tag `vX.Y.Z` (runs CI only).
45
+ 5. Run `publish-pypi` workflow manually with `ref=vX.Y.Z` and approve the
46
+ `pypi` environment.
47
+
48
+ ## Dependency pinning strategy
49
+
50
+ - Runtime dependencies are lower-bounded for compatibility and should remain
51
+ narrow on major version boundaries where practical.
52
+ - Add upper bounds only when an upstream major release is known to break the
53
+ CLI.
54
+ - For reproducible local development, use a lock file (`uv.lock`) in contributor
55
+ workflows; do not publish pinned transitive dependencies in the package
56
+ metadata unless required.
57
+
58
+ ## Release notes / changelog discipline
59
+
60
+ - Every release must include a `CHANGELOG.md` entry.
61
+ - Keep entries grouped under `Added`, `Changed`, `Fixed`, `Removed`.
62
+ - Link the tag to corresponding GitHub release notes.
63
+
64
+ ## First release checklist
65
+
66
+ 1. Confirm `omniscout` is available/reserved on PyPI.
67
+ 2. Confirm trusted publisher entry is configured on pypi.org.
68
+ 3. Ensure GitHub environment `pypi` has required reviewers (manual approval).
69
+ 4. Push tag `vX.Y.Z` and wait for CI to pass.
70
+ 5. Run `publish-pypi` workflow manually with `ref=vX.Y.Z` and approve `pypi`.
71
+ 6. Verify production install:
72
+ - `pip install omniscout`
73
+ - `scout --help`
@@ -0,0 +1,3 @@
1
+ """OmniScout `harness` CLI package."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,6 @@
1
+ """Allow `python -m harness` to invoke the CLI."""
2
+
3
+ from harness.app import main
4
+
5
+ if __name__ == "__main__":
6
+ main()