dumbwebsearch 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ Metadata-Version: 2.4
2
+ Name: dumbwebsearch
3
+ Version: 0.1.0
4
+ Summary: dumb web interface to search/browse indexed text files
5
+ Project-URL: Homepage, https://gitlab.com/hydrargyrum/dumbsearch
6
+ Author-email: Hg <dev@indigo.re>
7
+ License-Expression: WTFPL
8
+ Keywords: search,server,web
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Environment :: Web Environment
11
+ Classifier: Framework :: Bottle
12
+ Classifier: Intended Audience :: End Users/Desktop
13
+ Classifier: License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
14
+ Classifier: License :: Public Domain
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Internet :: WWW/HTTP :: WSGI :: Application
23
+ Requires-Python: >=3
24
+ Requires-Dist: bottle
25
+ Requires-Dist: jinja2
26
+ Description-Content-Type: text/markdown
27
+
28
+ # dumb web search
29
+
30
+ - index a directory of text files in sqlite database
31
+ - web browse this directory
32
+ - web search this directory with indexed text
33
+
34
+ ## how to run
35
+
36
+ ```
37
+ ./index.py --docroot /path/to/markdown/dir --database /path/to/database.sqlite index
38
+
39
+ HTTP_SERVER_BASEURL=/reverse/proxy/subpath HTTP_SERVER_PORT=1234 INDEX_DIR=/path/to/markdown/dir INDEX_DATABASE=/path/to/database.sqlite ./browse.wsgi
40
+ ```
41
+
42
+ ## requirements
43
+
44
+ - python3
45
+ - sqlite to index/search text
46
+ - wsgi to expose said app
47
+ - jinja2 + bottle as web framework
48
+ - pandoc
49
+
50
+ ## security
51
+
52
+ - no authentication: put a reverse proxy in front of it with some login
53
+ - no optimization for performance: put a reverse proxy to prevent hammering
54
+ - pandoc is run to render markdown files, which may have its own problems
55
+
56
+ ## what could be done better
57
+
58
+ - don't tie the app so tightly to markdown
@@ -0,0 +1,31 @@
1
+ # dumb web search
2
+
3
+ - index a directory of text files in sqlite database
4
+ - web browse this directory
5
+ - web search this directory with indexed text
6
+
7
+ ## how to run
8
+
9
+ ```
10
+ ./index.py --docroot /path/to/markdown/dir --database /path/to/database.sqlite index
11
+
12
+ HTTP_SERVER_BASEURL=/reverse/proxy/subpath HTTP_SERVER_PORT=1234 INDEX_DIR=/path/to/markdown/dir INDEX_DATABASE=/path/to/database.sqlite ./browse.wsgi
13
+ ```
14
+
15
+ ## requirements
16
+
17
+ - python3
18
+ - sqlite to index/search text
19
+ - wsgi to expose said app
20
+ - jinja2 + bottle as web framework
21
+ - pandoc
22
+
23
+ ## security
24
+
25
+ - no authentication: put a reverse proxy in front of it with some login
26
+ - no optimization for performance: put a reverse proxy to prevent hammering
27
+ - pandoc is run to render markdown files, which may have its own problems
28
+
29
+ ## what could be done better
30
+
31
+ - don't tie the app so tightly to markdown
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: WTFPL
3
+
4
+ import os
5
+ from pathlib import Path
6
+ import shutil
7
+ import subprocess
8
+
9
+ import bottle
10
+
11
+ from . import index
12
+
13
+
14
# WSGI application object; the route handlers in this module register on it.
APP = bottle.Bottle()

# URL prefix prepended to generated links and redirects (empty when the
# HTTP_SERVER_BASEURL environment variable is unset).
BASEURL = os.getenv("HTTP_SERVER_BASEURL", "")
17
+
18
# Jinja2 template shared by the listing page and the search results page.
# Variables: ``baseurl`` (URL prefix), ``search`` (current search term, echoed
# back into the input box) and ``results`` (sequence of (filename, snippet)
# pairs; the snippet may be empty).
# Fix: the input value previously used the non-existent filter ``espace``;
# the intended filter is ``escape``.
JPAGE = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Document search</title>
</head>
<body>
<form method="post" action="{{ baseurl }}/search">
<input name="term" value="{{ search | escape }}" />
</form>
<hr/>
{% for res in results %}
<li><a href="{{ baseurl }}/{{ res[0] | urlencode }}">
{{- res[0] | escape -}}
</a>
{%- if res[1] %} -- {{ res[1] | escape }}
{%- endif -%}
</li>
{% endfor %}
</body>
</html>
"""
42
+
43
# Directory whose *.md files are listed and served; INDEX_DIR must be set
# (a missing variable raises KeyError at import time).
ROOT = Path(os.environ["INDEX_DIR"])

# Module-wide index handle, opened once at import time on the database named
# by INDEX_DATABASE (also required).
DB = index.Db(docroot=ROOT)
DB.open(dbpath=os.environ["INDEX_DATABASE"])
47
+
48
+
49
@APP.get("/")
def root():
    """Render the landing page: a sorted list of the *.md files directly in ROOT.

    Each entry is passed to the template as ``(filename, "")`` so it has the
    same shape as a search result with an empty snippet.
    """
    listing = sorted((entry.name, "") for entry in ROOT.glob("*.md"))
    return bottle.jinja2_template(
        JPAGE, search="", results=listing, baseurl=BASEURL
    )
55
+
56
+
57
@APP.post("/search")
def search():
    """Run a full-text search for the posted ``term`` and render the results.

    A missing or blank ``term`` redirects back to the listing page.
    """
    # .get() rather than [...]: a POST without a "term" field previously
    # raised KeyError; now it is treated like an empty search.
    term = bottle.request.POST.get("term", "").strip()
    if not term:
        return bottle.redirect(f"{BASEURL}/")

    # NOTE(review): the term is handed to the index as-is and is used there as
    # an FTS5 MATCH expression; malformed query syntax presumably surfaces as
    # an sqlite3 error here -- confirm and decide how to report it.
    results = list(DB.searchiter(term))
    return bottle.jinja2_template(
        JPAGE, results=results, search=term, baseurl=BASEURL
    )
67
+
68
+
69
@APP.get("/<name>.md")
def getfile(name):
    """Serve one markdown document from ROOT.

    When a ``pandoc`` executable is on PATH the file is converted by pandoc
    (standalone output, sandboxed) and the converted bytes are returned;
    otherwise the raw file is served with ``bottle.static_file``.

    Aborts with 403 if ``name`` contains ``.`` or ``/`` and with 404 if the
    file does not exist.
    """
    # Reject anything that could point outside ROOT or at a different suffix.
    if "." in name or "/" in name:
        return bottle.abort(403)

    path = ROOT / f"{name}.md"
    # Check existence up front: otherwise a missing file makes pandoc exit
    # non-zero and check_output raises CalledProcessError instead of a 404.
    if not path.is_file():
        return bottle.abort(404, "File does not exist.")

    if shutil.which("pandoc"):
        return subprocess.check_output([
            "pandoc", "--sandbox=true", "-s", str(path)
        ])

    return bottle.static_file(f"{name}.md", str(ROOT))
80
+
81
+
82
if __name__ == "__main__":
    # Direct execution: serve APP via bottle's run(), on HTTP_SERVER_PORT
    # when set, else 3046.
    listen_port = int(os.environ.get("HTTP_SERVER_PORT", 3046))
    APP.run(port=listen_port)
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: WTFPL
3
+
4
+ import argparse
5
+ from pathlib import Path
6
+ import sqlite3
7
+
8
+
9
class Db:
    """SQLite full-text index over the ``*.md`` files directly inside a directory.

    Two tables are used: ``file(filename, mtime)`` and the FTS5 virtual table
    ``text(body)``. A document's rows in both tables share the same rowid.

    Typical use: ``Db(docroot)``, then ``open(path)``, ``initdb()``, and
    finally ``index()`` and/or ``searchiter(term)``.
    """

    def __init__(self, docroot):
        """Remember ``docroot`` (a ``pathlib.Path``); no database is opened yet."""
        self.docroot = docroot
        self.db = None

    def index(self):
        """Bring the index in line with the files currently in ``docroot``.

        New files are inserted, files whose mtime is newer than the stored one
        are re-read, and entries whose file no longer exists are removed.
        All changes are committed at the end.
        """
        files = {
            sub.name: sub.stat().st_mtime
            for sub in self.docroot.glob("*.md")
        }
        indb = {
            row["filename"]: row
            for row in self.db.execute("SELECT rowid, filename, mtime FROM file")
        }

        # rowids that still correspond to a file on disk
        rowids = set()
        for name, mtime in files.items():
            known = indb.get(name)
            if known is None:
                rowids.add(self._indexfile(name))
            else:
                if mtime > known["mtime"]:
                    self._indexfile(name, known["rowid"])
                rowids.add(known["rowid"])

        stale = [
            (row["rowid"],)
            for row in indb.values()
            if row["rowid"] not in rowids
        ]
        self.db.executemany("DELETE FROM text WHERE rowid = ?", stale)
        # Also drop the bookkeeping row. Leaving it behind made a file that
        # was deleted and later re-created take the UPDATE path, which
        # matched no ``text`` row, so the file never became searchable again.
        self.db.executemany("DELETE FROM file WHERE rowid = ?", stale)
        self.db.commit()

    def _indexfile(self, name, rowid=None):
        """Store the current mtime and content of ``docroot / name``.

        With ``rowid=None`` a new entry is inserted; otherwise the existing
        rows with that rowid are updated. Returns the rowid used. Does not
        commit.
        """
        path = self.docroot / name
        mtime = path.stat().st_mtime
        body = path.read_text()
        if rowid is None:
            cursor = self.db.execute(
                "INSERT INTO file(filename, mtime) VALUES(?, ?)", (name, mtime)
            )
            # rowid of the row just inserted; avoids a second lookup by name
            rowid = cursor.lastrowid
            self.db.execute("INSERT INTO text(rowid, body) VALUES(?, ?)", (rowid, body))
        else:
            self.db.execute("UPDATE file SET mtime = ? WHERE rowid = ?", (mtime, rowid))
            self.db.execute("UPDATE text SET body = ? WHERE rowid = ?", (body, rowid))
        return rowid

    def search(self, term):
        """Print every match for ``term``: a header line with the filename, the snippet, a blank line."""
        for row in self.searchiter(term):
            print("=", row["filename"], "=" * 30)
            print(row[1])
            print()

    def searchiter(self, term):
        """Yield ``(filename, snippet)`` rows matching the FTS5 query ``term``, ordered by rank.

        In the snippet, matched text is wrapped in ``[`` ``]`` and elided text
        is shown as ``...``.
        """
        yield from self.db.execute(
            """
            SELECT filename, snippet(text, 0, '[', ']', '...', 10)
            FROM text JOIN file ON text.rowid = file.rowid
            WHERE body MATCH ? ORDER BY rank
            """,
            (term,)
        )

    def open(self, dbpath):
        """Connect to the SQLite database at ``dbpath`` and return rows as ``sqlite3.Row``."""
        # No ``autocommit=`` keyword: it only exists on Python 3.12+, while
        # the package metadata advertises 3.8+. Under the default (legacy)
        # transaction control, write statements open a transaction
        # implicitly and are persisted by the explicit commit() calls in
        # this class; a connection that only reads does not keep a
        # transaction open.
        self.db = sqlite3.connect(dbpath)
        self.db.row_factory = sqlite3.Row

    def initdb(self):
        """Create the ``file`` table and the FTS5 ``text`` table if they are missing."""
        self.db.execute(
            "CREATE TABLE IF NOT EXISTS file(filename TEXT, mtime INTEGER)"
        )
        self.db.execute(
            "CREATE VIRTUAL TABLE IF NOT EXISTS text USING fts5(body)"
        )
        # Make the schema durable regardless of the transaction mode in use.
        self.db.commit()
82
+
83
+
84
def main():
    """Command-line entry point: refresh the index or run a search.

    Global options ``--database`` and ``--docroot`` select the SQLite file
    and the document directory; the required subcommand is ``index`` or
    ``search TERM``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--database", type=Path, default="index.sqlite")
    parser.add_argument("--docroot", type=Path, default=Path.cwd())

    commands = parser.add_subparsers(dest="subcommand", required=True)

    index_cmd = commands.add_parser("index")
    # NOTE: --prune is accepted, but nothing in this function reads it.
    index_cmd.add_argument("--prune", action="store_true")

    search_cmd = commands.add_parser("search")
    search_cmd.add_argument("term")

    options = parser.parse_args()

    store = Db(options.docroot)
    store.open(options.database)
    store.initdb()

    if options.subcommand == "index":
        store.index()
    elif options.subcommand == "search":
        store.search(options.term)
    else:
        raise NotImplementedError()


if __name__ == "__main__":
    main()
@@ -0,0 +1,55 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "dumbwebsearch"
7
+ dynamic = ["version"]
8
+ description = "dumb web interface to search/browse indexed text files"
9
+ readme = "README.md"
10
+ license = "WTFPL"
11
+ requires-python = ">=3"
12
+ authors = [
13
+ { name = "Hg", email = "dev@indigo.re" },
14
+ ]
15
+ keywords = [
16
+ "server",
17
+ "search",
18
+ "web",
19
+ ]
20
+ classifiers = [
21
+ "Development Status :: 4 - Beta",
22
+ "Environment :: Web Environment",
23
+ "Framework :: Bottle",
24
+ "Intended Audience :: End Users/Desktop",
25
+ "License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication",
26
+ "License :: Public Domain",
27
+ "Programming Language :: Python :: 3",
28
+ "Programming Language :: Python :: 3.8",
29
+ "Programming Language :: Python :: 3.9",
30
+ "Programming Language :: Python :: 3.10",
31
+ "Programming Language :: Python :: 3.11",
32
+ "Programming Language :: Python :: 3.12",
33
+ "Programming Language :: Python :: 3.13",
34
+ "Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
35
+ ]
36
+ dependencies = [
37
+ "bottle",
38
+ "Jinja2",
39
+ ]
40
+
41
[project.scripts]
# The importable package is "dumbwebsearch" (see tool.hatch.version.path and
# the wsgi entry point), not "dumbsearch".
dumbwebsearch-index = "dumbwebsearch.index:main"
43
+
44
+ [project.entry-points."wsgi"]
45
+ dumbwebsearch = "dumbwebsearch.browse:APP"
46
+
47
+ [project.urls]
48
+ Homepage = "https://gitlab.com/hydrargyrum/dumbsearch"
49
+
50
+ [tool.hatch.version]
51
+ path = "dumbwebsearch/__init__.py"
52
+
53
+ # requires flake8-pyproject
54
+ [tool.flake8]
55
+ max-line-length = 120
@@ -0,0 +1,2 @@
1
+ bottle
2
+ Jinja2
@@ -0,0 +1,21 @@
1
+
2
+ import dumbwebsearch.index
3
+
4
+ import subprocess
5
+
6
+
7
def run(args):
    """Run the indexer CLI with ``args`` in a subprocess and return its stdout as text.

    Raises ``subprocess.CalledProcessError`` if the command exits non-zero.
    """
    import sys

    # Launch through the current interpreter instead of executing the module
    # file directly, which only works when that file has an executable bit
    # and a usable shebang.
    return subprocess.check_output(
        [sys.executable, dumbwebsearch.index.__file__, *args], encoding="utf8"
    )
9
+
10
+
11
def test_basic(tmp_path):
    """Index a single document, then check one miss and one hit through the CLI."""
    (tmp_path / "foo.md").write_text("some text")
    common = ["--database", str(tmp_path / "db"), "--docroot", str(tmp_path)]

    run([*common, "index"])

    # A term that is not in the document prints nothing.
    miss = run([*common, "search", "failure"])
    assert miss == ""

    # A matching term prints the filename header and the highlighted snippet.
    hit = run([*common, "search", "text"])
    assert hit.strip().split("\n") == ["= foo.md ==============================", "some [text]"]
21
+