dumbwebsearch 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dumbwebsearch-0.1.0/PKG-INFO +58 -0
- dumbwebsearch-0.1.0/README.md +31 -0
- dumbwebsearch-0.1.0/dumbwebsearch/__init__.py +1 -0
- dumbwebsearch-0.1.0/dumbwebsearch/browse.py +83 -0
- dumbwebsearch-0.1.0/dumbwebsearch/index.py +113 -0
- dumbwebsearch-0.1.0/pyproject.toml +55 -0
- dumbwebsearch-0.1.0/requirements.txt +2 -0
- dumbwebsearch-0.1.0/tests/test_index.py +21 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dumbwebsearch
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: dumb web interface to search/browse indexed text files
|
|
5
|
+
Project-URL: Homepage, https://gitlab.com/hydrargyrum/dumbsearch
|
|
6
|
+
Author-email: Hg <dev@indigo.re>
|
|
7
|
+
License-Expression: WTFPL
|
|
8
|
+
Keywords: search,server,web
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Environment :: Web Environment
|
|
11
|
+
Classifier: Framework :: Bottle
|
|
12
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
13
|
+
Classifier: License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
|
|
14
|
+
Classifier: License :: Public Domain
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: WSGI :: Application
|
|
23
|
+
Requires-Python: >=3
|
|
24
|
+
Requires-Dist: bottle
|
|
25
|
+
Requires-Dist: jinja2
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# dumb web search
|
|
29
|
+
|
|
30
|
+
- index a directory of text files in sqlite database
|
|
31
|
+
- web browse this directory
|
|
32
|
+
- web search this directory with indexed text
|
|
33
|
+
|
|
34
|
+
## how to run
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
./index.py --docroot /path/to/markdown/dir --database /path/to/database.sqlite index
|
|
38
|
+
|
|
39
|
+
HTTP_SERVER_BASEURL=/reverse/proxy/subpath HTTP_SERVER_PORT=1234 INDEX_DIR=/path/to/markdown/dir INDEX_DATABASE=/path/to/database.sqlite ./browse.wsgi
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## requirements
|
|
43
|
+
|
|
44
|
+
- python3
|
|
45
|
+
- sqlite to index/search text
|
|
46
|
+
- wsgi to expose said app
|
|
47
|
+
- jinja2 + bottle as web framework
|
|
48
|
+
- pandoc
|
|
49
|
+
|
|
50
|
+
## security
|
|
51
|
+
|
|
52
|
+
- no authentication: put a reverse proxy in front of it with some login
|
|
53
|
+
- no optimization for performance: put a reverse proxy to prevent hammering
|
|
54
|
+
- pandoc is run to render markdown files, which may have its own problems
|
|
55
|
+
|
|
56
|
+
## what could be done better
|
|
57
|
+
|
|
58
|
+
- don't make markdown so deeply ingrained in this app
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# dumb web search
|
|
2
|
+
|
|
3
|
+
- index a directory of text files in sqlite database
|
|
4
|
+
- web browse this directory
|
|
5
|
+
- web search this directory with indexed text
|
|
6
|
+
|
|
7
|
+
## how to run
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
./index.py --docroot /path/to/markdown/dir --database /path/to/database.sqlite index
|
|
11
|
+
|
|
12
|
+
HTTP_SERVER_BASEURL=/reverse/proxy/subpath HTTP_SERVER_PORT=1234 INDEX_DIR=/path/to/markdown/dir INDEX_DATABASE=/path/to/database.sqlite ./browse.wsgi
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## requirements
|
|
16
|
+
|
|
17
|
+
- python3
|
|
18
|
+
- sqlite to index/search text
|
|
19
|
+
- wsgi to expose said app
|
|
20
|
+
- jinja2 + bottle as web framework
|
|
21
|
+
- pandoc
|
|
22
|
+
|
|
23
|
+
## security
|
|
24
|
+
|
|
25
|
+
- no authentication: put a reverse proxy in front of it with some login
|
|
26
|
+
- no optimization for performance: put a reverse proxy to prevent hammering
|
|
27
|
+
- pandoc is run to render markdown files, which may have its own problems
|
|
28
|
+
|
|
29
|
+
## what could be done better
|
|
30
|
+
|
|
31
|
+
- don't make markdown so deeply ingrained in this app
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: WTFPL
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
|
|
9
|
+
import bottle
|
|
10
|
+
|
|
11
|
+
from . import index
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
APP = bottle.Bottle()
|
|
15
|
+
|
|
16
|
+
# URL prefix taken from the HTTP_SERVER_BASEURL environment variable (empty when
# unset); it is prepended to the form action, the result links and the redirect.
BASEURL = os.environ.get("HTTP_SERVER_BASEURL", "")
|
|
17
|
+
|
|
18
|
+
# Jinja2 page template shared by the file listing and the search results.
# Variables it reads:
#   baseurl -- prefix put in front of the form action and of every result link
#   search  -- text pre-filled in the search box
#   results -- iterable of (filename, snippet) pairs; an empty snippet is omitted
# Every interpolated value goes through `escape` (or `urlencode` inside the
# href) because file names and search terms are not trusted markup.
JPAGE = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Document search</title>
</head>
<body>
<form method="post" action="{{ baseurl }}/search">
<input name="term" value="{{ search | escape }}" />
</form>
<hr/>
{% for res in results %}
<li><a href="{{ baseurl }}/{{ res[0] | urlencode }}">
{{- res[0] | escape -}}
</a>
{%- if res[1] %} -- {{ res[1] | escape }}
{%- endif -%}
</li>
{% endfor %}
</body>
</html>
"""
|
|
42
|
+
|
|
43
|
+
# Directory holding the documents, read from the INDEX_DIR environment variable;
# the lookup raises KeyError at import time when the variable is not set.
ROOT = Path(os.environ["INDEX_DIR"])
|
|
44
|
+
|
|
45
|
+
DB = index.Db(ROOT)
|
|
46
|
+
DB.open(os.environ["INDEX_DATABASE"])
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@APP.get("/")
def root():
    """Render the landing page: a sorted listing of every ``*.md`` file directly in ROOT.

    Each entry is a ``(filename, "")`` pair so the template can treat the
    listing exactly like search results that carry no snippet.
    """
    listing = [(path.name, "") for path in ROOT.glob("*.md")]
    listing.sort()
    return bottle.jinja2_template(JPAGE, search="", results=listing, baseurl=BASEURL)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@APP.post("/search")
def search():
    """Run a full-text search for the posted ``term`` and render the matches.

    A request whose ``term`` field is missing or blank is redirected to the
    listing page instead of failing on the missing form key.

    The term is handed to ``DB.searchiter``, which uses it as the right-hand
    side of an FTS5 ``MATCH``, so FTS5 query syntax applies to what the user
    types.
    """
    # .get() instead of [...]: an absent field must not surface as a KeyError.
    term = bottle.request.POST.get("term", "").strip()
    if not term:
        return bottle.redirect(f"{BASEURL}/")

    # Materialize the rows before rendering so the template gets a plain list.
    results = list(DB.searchiter(term))
    return bottle.jinja2_template(
        JPAGE, results=results, search=term, baseurl=BASEURL
    )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@APP.get("/<name>.md")
def getfile(name):
    """Serve one markdown document from ROOT, rendered by pandoc when available.

    ``name`` is the file name without its ``.md`` suffix. Names containing a
    dot or a slash are rejected with 403 so a request cannot leave ROOT or
    reach dot-files. A document that does not exist yields 404 whether or not
    pandoc is installed.
    """
    if "." in name or "/" in name:
        return bottle.abort(403)

    path = ROOT / f"{name}.md"
    # Check before invoking pandoc: on a missing input pandoc exits non-zero
    # and check_output would raise, turning a plain "not found" into a 500.
    if not path.is_file():
        return bottle.abort(404, "File does not exist.")

    if shutil.which("pandoc"):
        # Argument list (no shell); pandoc's sandbox mode is kept enabled.
        return subprocess.check_output([
            "pandoc", "--sandbox=true", "-s", str(path)
        ])

    # No pandoc on PATH: fall back to sending the raw markdown file.
    return bottle.static_file(f"{name}.md", str(ROOT))
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
if __name__ == "__main__":
|
|
83
|
+
APP.run(port=int(os.environ.get("HTTP_SERVER_PORT", 3046)))
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: WTFPL
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import sqlite3
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Db:
    """SQLite full-text index over the ``*.md`` files found directly in a directory.

    Two tables are used: ``file(filename, mtime)`` and the FTS5 virtual table
    ``text(body)``. A document's row in ``text`` carries the same rowid as its
    row in ``file``; that shared rowid is what the search query joins on.
    """

    def __init__(self, docroot):
        """Remember *docroot* (a ``pathlib.Path``); call :meth:`open` before anything else."""
        self.docroot = docroot
        self.db = None

    def index(self):
        """Bring the database in line with the ``*.md`` files currently in docroot.

        New files are added, files whose mtime is newer than the recorded one
        are re-read, and files that disappeared are removed from both tables.
        The whole pass is committed at the end.
        """
        files = {
            sub.name: sub.stat().st_mtime
            for sub in self.docroot.glob("*.md")
        }
        indb = {
            row["filename"]: row
            for row in self.db.execute("SELECT rowid, filename, mtime FROM file")
        }

        for name, mtime in files.items():
            known = indb.get(name)
            if known is None:
                self._indexfile(name)
            elif mtime > known["mtime"]:
                self._indexfile(name, known["rowid"])

        # Drop vanished files from BOTH tables. Leaving the `file` row behind
        # would make a later re-creation of the file look like an update of a
        # text row that no longer exists.
        stale = [(row["rowid"],) for name, row in indb.items() if name not in files]
        self.db.executemany("DELETE FROM text WHERE rowid = ?", stale)
        self.db.executemany("DELETE FROM file WHERE rowid = ?", stale)
        self.db.commit()

    def _indexfile(self, name, rowid=None):
        """Read docroot/*name* and store its body; return the rowid used.

        With ``rowid=None`` a new ``file`` row is inserted; otherwise the
        existing row's mtime is refreshed and its text is replaced. Does not
        commit.
        """
        path = self.docroot / name
        mtime = path.stat().st_mtime
        body = path.read_text()
        if rowid is None:
            cursor = self.db.execute(
                "INSERT INTO file(filename, mtime) VALUES(?, ?)", (name, mtime)
            )
            rowid = cursor.lastrowid
        else:
            self.db.execute("UPDATE file SET mtime = ? WHERE rowid = ?", (mtime, rowid))
            # Delete-then-insert rather than UPDATE so the text is restored even
            # if the matching text row is missing from an existing database.
            self.db.execute("DELETE FROM text WHERE rowid = ?", (rowid,))
        self.db.execute("INSERT INTO text(rowid, body) VALUES(?, ?)", (rowid, body))
        return rowid

    def search(self, term):
        """Print every match for *term*: a ``= filename ===…`` header, the snippet, a blank line."""
        for row in self.searchiter(term):
            print("=", row["filename"], "=" * 30)
            print(row[1])
            print()

    def searchiter(self, term):
        """Yield ``(filename, snippet)`` rows matching the FTS5 query *term*, ordered by rank.

        The snippet marks matched words with ``[`` and ``]`` and uses ``...``
        where text was cut.
        """
        yield from self.db.execute(
            """
            SELECT filename, snippet(text, 0, '[', ']', '...', 10)
            FROM text JOIN file ON text.rowid = file.rowid
            WHERE body MATCH ? ORDER BY rank
            """,
            (term,)
        )

    def open(self, dbpath):
        """Connect to the SQLite database at *dbpath* and enable access to columns by name."""
        try:
            self.db = sqlite3.connect(dbpath, autocommit=False)
        except TypeError:
            # The `autocommit` keyword only exists on Python >= 3.12. Older
            # interpreters fall back to the legacy transaction handling, which
            # still requires the explicit commit() done by index().
            self.db = sqlite3.connect(dbpath)
        self.db.row_factory = sqlite3.Row

    def initdb(self):
        """Create the ``file`` table and the FTS5 ``text`` table if they do not exist yet."""
        self.db.execute(
            "CREATE TABLE IF NOT EXISTS file(filename TEXT, mtime INTEGER)"
        )
        self.db.execute(
            "CREATE VIRTUAL TABLE IF NOT EXISTS text USING fts5(body)"
        )
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def main():
    """Command-line entry point: build the index or query it.

    The global options ``--database`` (default ``index.sqlite``) and
    ``--docroot`` (default: the current directory) come before the subcommand,
    which is either ``index`` or ``search TERM``. The tables are created on
    every run if they are missing.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--database", type=Path, default="index.sqlite")
    cli.add_argument("--docroot", type=Path, default=Path.cwd())

    commands = cli.add_subparsers(dest="subcommand", required=True)

    index_cmd = commands.add_parser("index")
    # NOTE: --prune is accepted, but nothing in this function reads it.
    index_cmd.add_argument("--prune", action="store_true")

    search_cmd = commands.add_parser("search")
    search_cmd.add_argument("term")

    options = cli.parse_args()

    store = Db(options.docroot)
    store.open(options.database)
    store.initdb()

    if options.subcommand == "index":
        store.index()
        return
    if options.subcommand == "search":
        store.search(options.term)
        return
    raise NotImplementedError()
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
if __name__ == "__main__":
|
|
113
|
+
main()
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dumbwebsearch"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "dumb web interface to search/browse indexed text files"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "WTFPL"
|
|
11
|
+
requires-python = ">=3"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Hg", email = "dev@indigo.re" },
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"server",
|
|
17
|
+
"search",
|
|
18
|
+
"web",
|
|
19
|
+
]
|
|
20
|
+
classifiers = [
|
|
21
|
+
"Development Status :: 4 - Beta",
|
|
22
|
+
"Environment :: Web Environment",
|
|
23
|
+
"Framework :: Bottle",
|
|
24
|
+
"Intended Audience :: End Users/Desktop",
|
|
25
|
+
"License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication",
|
|
26
|
+
"License :: Public Domain",
|
|
27
|
+
"Programming Language :: Python :: 3",
|
|
28
|
+
"Programming Language :: Python :: 3.8",
|
|
29
|
+
"Programming Language :: Python :: 3.9",
|
|
30
|
+
"Programming Language :: Python :: 3.10",
|
|
31
|
+
"Programming Language :: Python :: 3.11",
|
|
32
|
+
"Programming Language :: Python :: 3.12",
|
|
33
|
+
"Programming Language :: Python :: 3.13",
|
|
34
|
+
"Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
|
|
35
|
+
]
|
|
36
|
+
dependencies = [
|
|
37
|
+
"bottle",
|
|
38
|
+
"Jinja2",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[project.scripts]
|
|
42
|
+
dumbwebsearch-index = "dumbwebsearch.index:main"
|
|
43
|
+
|
|
44
|
+
[project.entry-points."wsgi"]
|
|
45
|
+
dumbwebsearch = "dumbwebsearch.browse:APP"
|
|
46
|
+
|
|
47
|
+
[project.urls]
|
|
48
|
+
Homepage = "https://gitlab.com/hydrargyrum/dumbsearch"
|
|
49
|
+
|
|
50
|
+
[tool.hatch.version]
|
|
51
|
+
path = "dumbwebsearch/__init__.py"
|
|
52
|
+
|
|
53
|
+
# requires flake8-pyproject
|
|
54
|
+
[tool.flake8]
|
|
55
|
+
max-line-length = 120
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
|
|
2
|
+
import dumbwebsearch.index
|
|
3
|
+
|
|
4
|
+
import subprocess
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def run(args):
    """Run the indexer CLI with *args* in a subprocess and return its stdout as text.

    Raises ``subprocess.CalledProcessError`` when the command exits non-zero.
    """
    # Local import: this helper is the only user of `sys` in the test module.
    import sys

    # Launch through the current interpreter instead of executing index.py
    # directly, which would depend on the file's exec bit and shebang line.
    return subprocess.check_output(
        [sys.executable, dumbwebsearch.index.__file__, *args], encoding="utf8"
    )
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_basic(tmp_path):
    """Index a one-file docroot through the CLI, then check a miss and a hit."""
    (tmp_path / "foo.md").write_text("some text")
    common = ["--database", str(tmp_path / "db"), "--docroot", str(tmp_path)]

    run([*common, "index"])

    # A word that is not in the document prints nothing at all.
    miss = run([*common, "search", "failure"])
    assert miss == ""

    # A word that is in the document prints the header line and the snippet.
    hit = run([*common, "search", "text"])
    assert hit.strip().split("\n") == ["= foo.md ==============================", "some [text]"]
|
|
21
|
+
|