PyPI - philologic - Versions diffs - 5.2.0.2__tar.gz → 5.2.2__tar.gz - Mend

philologic 5.2.0.2 (tar.gz) → 5.2.2 (tar.gz)

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

{philologic-5.2.0.2 → philologic-5.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: philologic
-Version: 5.2.0.2
+Version: 5.2.2
 Summary: A concordance search engine for TEI-XML
 Author-email: Clovis Gladstone <clovisgladstone@artfl.uchicago.edu>
 License-Expression: GPL-3.0-or-later

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/Config.py RENAMED Viewed

@@ -164,7 +164,11 @@ DB_LOCALS_DEFAULTS = {
     "overflow_words": {
         "value": set(),
         "comment": "# The overflow_words variable is a set of words which are not indexed in the database, but stored as blobs in the data/overflow_words directory.",
-    }
+    },
+    "query_patterns": {
+        "value": None,
+        "comment": "# Custom query tokenization patterns. When set, overrides the default patterns in QuerySyntax.parse_query.\n# Must be a list of (label, regex) tuples, e.g. [(\"TERM\", r'[^\\s\"]+'), ...].\n# When None, the built-in default patterns are used.",
+    },
 }
 DB_LOCALS_HEADER = """
    #########################################################\n

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/DB.py RENAMED Viewed

@@ -269,7 +269,7 @@ class DB:
                     raw_bytes=raw_bytes,
                     ascii_conversion=self.locals.ascii_conversion,
                 )
-            parsed = QuerySyntax.parse_query(qs)
+            parsed = QuerySyntax.parse_query(qs, query_patterns=self.locals.query_patterns)
             grouped = QuerySyntax.group_terms(parsed)
             split = Query.split_terms(grouped)
             words_per_hit = len(split)

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/MetadataQuery.py RENAMED Viewed

@@ -200,7 +200,7 @@ def query_lowlevel(db, param_dict, sort_order, ascii_conversion):
         for v in values:
             parsed = "text"
             if db.locals.metadata_sql_types[column] in ("text", "int"):
-                parsed = parse_query(v)
+                parsed = parse_query(v, query_patterns=db.locals.query_patterns)
             elif db.locals.metadata_sql_types[column] == "date":
                 v = v.replace('"', "")  # remove quotes
                 parsed = parse_date_query(v)

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/Query.py RENAMED Viewed

@@ -7,24 +7,25 @@ import threading
 from bisect import bisect_left, bisect_right
 from pathlib import Path
+# Set Numba cache directory BEFORE importing numba — otherwise Numba resolves
+# its cache locator using the default (write next to source file), which fails
+# when the source is in a read-only site-packages directory.
+_cache_dir = os.environ.get("NUMBA_CACHE_DIR", "/var/lib/philologic5/numba_cache")
+if not os.access(_cache_dir, os.W_OK):
+    _cache_dir = f"/tmp/philologic_numba_cache_{os.getuid()}"
+    os.makedirs(_cache_dir, mode=0o755, exist_ok=True)
+os.environ["NUMBA_CACHE_DIR"] = _cache_dir
 import lmdb
 import numba
 import numpy as np
 import regex as re
+numba.config.CACHE_DIR = _cache_dir
 from philologic.runtime import HitList
 from philologic.runtime.QuerySyntax import group_terms, parse_query
-# Set Numba cache directory
-# Try shared cache first, fall back to /tmp if permission denied
-cache_dir = "/var/lib/philologic5/numba_cache"
-if not os.access(cache_dir, os.W_OK):
-    # In hardened containers, use per-user temp cache
-    cache_dir = f"/tmp/philologic_numba_cache_{os.getuid()}"
-    os.makedirs(cache_dir, mode=0o755, exist_ok=True)
-os.environ["NUMBA_CACHE_DIR"] = cache_dir
-numba.config.CACHE_DIR = cache_dir
 @numba.jit(nopython=True, cache=True, nogil=True)
 def _merge_two_sorted_arrays(arr1, arr2):
@@ -455,7 +456,7 @@ def query(
 ):
     """Runs concordance queries"""
     sys.stdout.flush()
-    parsed = parse_query(terms)
+    parsed = parse_query(terms, query_patterns=db.locals.query_patterns)
     grouped = group_terms(parsed)
     split = split_terms(grouped)
     words_per_hit = len(split)

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/QuerySyntax.py RENAMED Viewed

@@ -31,12 +31,14 @@ date_patterns = [
 ]
-def parse_query(qstring):
+def parse_query(qstring, query_patterns=None):
     """Parse query"""
+    if query_patterns is None:
+        query_patterns = patterns
     buf = qstring[:]
     parsed = []
     while len(buf) > 0:
-        for label, pattern in patterns:
+        for label, pattern in query_patterns:
             m = re.match(pattern, buf)
             if m:
                 parsed.append((label, m.group()))

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/__init__.py RENAMED Viewed

@@ -7,21 +7,27 @@ from philologic.runtime.get_text import get_concordance_text, get_tei_header
 from philologic.runtime.pages import page_interval
 from philologic.runtime.Query import parse_query
 from philologic.runtime.reports import (
+    aggregation_by_field,
+    aggregation_to_csv,
     bibliography_results,
+    bibliography_to_csv,
     collocation_results,
+    collocation_to_csv,
     concordance_results,
+    concordance_to_csv,
     frequency_results,
     generate_text_object,
     generate_time_series,
     generate_toc_object,
     generate_word_frequency,
     get_start_end_date,
-    kwic_hit_object,
-    kwic_results,
     group_by_metadata,
     group_by_range,
+    kwic_hit_object,
+    kwic_results,
+    kwic_to_csv,
     landing_page_bibliography,
-    aggregation_by_field,
+    time_series_to_csv,
 )
 from philologic.runtime.web_config import WebConfig
 from philologic.runtime.WSGIHandler import WSGIHandler

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/access_control.py RENAMED Viewed

@@ -309,15 +309,15 @@ def login_access(environ, request, config, headers):
                 token = make_token(db)
                 if token:
                     h, ts = token
-                    headers.append(("Set-Cookie", f"hash={h}"))
-                    headers.append(("Set-Cookie", f"timestamp={ts}"))
+                    headers.append(("Set-Cookie", f"hash={h}; Path=/"))
+                    headers.append(("Set-Cookie", f"timestamp={ts}; Path=/"))
         else:
             # WORKAROUND because cookie not being sent on access_request.py request
             token = check_access(environ, config)
             if token:
                 h, ts = token
-                headers.append(("Set-Cookie", f"hash={h}"))
-                headers.append(("Set-Cookie", f"timestamp={ts}"))
+                headers.append(("Set-Cookie", f"hash={h}; Path=/"))
+                headers.append(("Set-Cookie", f"timestamp={ts}; Path=/"))
                 access = True
             else:
                 access = False

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/__init__.py RENAMED Viewed

@@ -1,13 +1,13 @@
 """Report exports"""
-from philologic.runtime.reports.concordance import concordance_results
-from philologic.runtime.reports.bibliography import bibliography_results
-from philologic.runtime.reports.time_series import generate_time_series, get_start_end_date
+from philologic.runtime.reports.concordance import concordance_results, concordance_to_csv
+from philologic.runtime.reports.bibliography import bibliography_results, bibliography_to_csv
+from philologic.runtime.reports.time_series import generate_time_series, get_start_end_date, time_series_to_csv
 from philologic.runtime.reports.navigation import generate_text_object
 from philologic.runtime.reports.table_of_contents import generate_toc_object
-from philologic.runtime.reports.kwic import kwic_results, kwic_hit_object
+from philologic.runtime.reports.kwic import kwic_results, kwic_hit_object, kwic_to_csv
 from philologic.runtime.reports.generate_word_frequency import generate_word_frequency
 from philologic.runtime.reports.frequency import frequency_results
-from philologic.runtime.reports.collocation import collocation_results
+from philologic.runtime.reports.collocation import collocation_results, collocation_to_csv
 from philologic.runtime.reports.landing_page import landing_page_bibliography, group_by_range, group_by_metadata
-from philologic.runtime.reports.aggregation import aggregation_by_field
+from philologic.runtime.reports.aggregation import aggregation_by_field, aggregation_to_csv

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/aggregation.py RENAMED Viewed

@@ -1,6 +1,9 @@
 # /usr/bin/env python3
 """Report designed to group results by metadata with additional breakdown optional"""
+import csv
+import io
 import numpy as np
 from philologic.runtime.DB import DB
@@ -137,6 +140,45 @@ def aggregation_by_field(request, config):
     }
+def aggregation_to_csv(results, break_up_field_name=""):
+    """Convert aggregation results to CSV string.
+    Each breakdown entry gets its own row. Rows from the same group
+    are contiguous, with the group-level metadata repeated.
+    """
+    if not results:
+        return ""
+    output = io.StringIO()
+    first = results[0]
+    group_keys = sorted(k for k in first["metadata_fields"].keys() if k not in ("field_name", "philo_id"))
+    has_breakdown = break_up_field_name and any(r["break_up_field"] for r in results)
+    if has_breakdown:
+        # Collect all metadata keys from breakdown entries
+        breakdown_keys = set()
+        for result in results:
+            for sub in result["break_up_field"]:
+                breakdown_keys.update(k for k in sub["metadata_fields"].keys() if k not in ("field_name", "philo_id"))
+        breakdown_keys = sorted(breakdown_keys - set(group_keys))
+        fieldnames = group_keys + ["group_count"] + breakdown_keys + ["count"]
+    else:
+        fieldnames = group_keys + ["count"]
+    writer = csv.DictWriter(output, fieldnames=fieldnames)
+    writer.writeheader()
+    for result in results:
+        group_fields = {k: result["metadata_fields"].get(k, "") for k in group_keys}
+        if has_breakdown and result["break_up_field"]:
+            for sub in result["break_up_field"]:
+                row = {**group_fields, "group_count": result["count"]}
+                for k in breakdown_keys:
+                    row[k] = sub["metadata_fields"].get(k, "")
+                row["count"] = sub["count"]
+                writer.writerow(row)
+        else:
+            row = {**group_fields, "count": result["count"]}
+            writer.writerow(row)
+    return output.getvalue()
 def __expand_hits_counted(hits, metadata_type):
     """Stream sorted hitlist with numpy, return per-ID hit counts.

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/bibliography.py RENAMED Viewed

@@ -1,6 +1,8 @@
 #!/var/lib/philologic5/philologic_env/bin/python3
 """Bibliography results"""
+import csv
+import io
 from philologic.runtime.citations import citation_links, citations
 from philologic.runtime.DB import DB
@@ -77,3 +79,19 @@ def bibliography_results(request, config):
     bibliography_object["query_done"] = hits.done
     bibliography_object["result_type"] = result_type
     return bibliography_object, hits
+def bibliography_to_csv(results):
+    """Convert bibliography results to CSV string."""
+    if not results:
+        return ""
+    output = io.StringIO()
+    metadata_keys = sorted(results[0]["metadata_fields"].keys())
+    fieldnames = ["philo_id"] + metadata_keys
+    writer = csv.DictWriter(output, fieldnames=fieldnames)
+    writer.writeheader()
+    for result in results:
+        row = {"philo_id": " ".join(str(x) for x in result["philo_id"])}
+        row.update(result["metadata_fields"])
+        writer.writerow(row)
+    return output.getvalue()

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/collocation.py RENAMED Viewed

@@ -580,6 +580,21 @@ def collocation_results(request, config):
     return collocation_object
+def collocation_to_csv(collocates):
+    """Convert collocation results (list of (word, count) tuples) to CSV string."""
+    import csv
+    import io
+    if not collocates:
+        return ""
+    output = io.StringIO()
+    writer = csv.DictWriter(output, fieldnames=["collocate", "count"])
+    writer.writeheader()
+    for word, count in collocates:
+        writer.writerow({"collocate": word, "count": count})
+    return output.getvalue()
 def atomic_pickle_dump(data, file_path):
     """Write pickle atomically to prevent truncated reads from concurrent requests."""
     dir_path = os.path.dirname(file_path)

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/concordance.py RENAMED Viewed

@@ -1,6 +1,9 @@
 #!/var/lib/philologic5/philologic_env/bin/python3
 """Concordance report"""
+import csv
+import io
 import regex as re
 from philologic.runtime.citations import citation_links, citations
 from philologic.runtime.DB import DB
@@ -60,3 +63,23 @@ def concordance_results(request, config):
     concordance_object["results_length"] = len(hits)
     concordance_object["query_done"] = hits.done
     return concordance_object
+def concordance_to_csv(results, filter_html=False):
+    """Convert concordance results to CSV string."""
+    if not results:
+        return ""
+    tags_re = re.compile(r"<[^>]+>")
+    output = io.StringIO()
+    metadata_keys = sorted(results[0]["metadata_fields"].keys())
+    fieldnames = ["philo_id", "context"] + metadata_keys
+    writer = csv.DictWriter(output, fieldnames=fieldnames)
+    writer.writeheader()
+    for result in results:
+        context = result["context"]
+        if filter_html:
+            context = tags_re.sub("", context).strip()
+        row = {"philo_id": " ".join(str(x) for x in result["philo_id"]), "context": context}
+        row.update(result["metadata_fields"])
+        writer.writerow(row)
+    return output.getvalue()

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/kwic.py RENAMED Viewed

@@ -1,6 +1,8 @@
 #!/var/lib/philologic5/philologic_env/bin/python3
 """KWIC results"""
+import csv
+import io
 import regex as re
 from philologic.runtime.citations import citation_links, citations
@@ -92,3 +94,23 @@ def kwic_hit_object(hit, config, db):
     }
     return kwic_result
+def kwic_to_csv(results, filter_html=False):
+    """Convert KWIC results to CSV string."""
+    if not results:
+        return ""
+    tags_re = re.compile(r"<[^>]+>")
+    output = io.StringIO()
+    metadata_keys = sorted(results[0]["metadata_fields"].keys())
+    fieldnames = ["philo_id", "context"] + metadata_keys
+    writer = csv.DictWriter(output, fieldnames=fieldnames)
+    writer.writeheader()
+    for result in results:
+        context = result["context"]
+        if filter_html:
+            context = tags_re.sub("", context).strip()
+        row = {"philo_id": " ".join(str(x) for x in result["philo_id"]), "context": context}
+        row.update(result["metadata_fields"])
+        writer.writerow(row)
+    return output.getvalue()

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/time_series.py RENAMED Viewed

@@ -2,7 +2,6 @@
 """Time series"""
 import os
-import time
 import numba
 import numpy as np
@@ -96,7 +95,6 @@ def _bucket_hits_by_year(doc_ids, year_array, start_date, interval, n_ranges):
 def generate_time_series(request, config):
-    t0 = time.time()
     db = DB(config.db_path + "/data/")
     year_field = validate_column(config.time_series_year_field, db)
     time_series_object = {"query": dict([i for i in request]), "query_done": False}
@@ -109,25 +107,24 @@ def generate_time_series(request, config):
         time_series_object["results"] = {"absolute_count": {}, "date_count": {}}
         return time_series_object
-    interval = int(request.year_interval)
+    try:
+        interval = int(request.year_interval)
+    except (ValueError, TypeError):
+        interval = int(config.time_series_interval)
     # Get cached doc→year mapping (SQL only on first request per worker)
-    t1 = time.time()
     year_array, year_word_counts, year_doc_counts, min_date, max_date = _get_doc_year_data(db, year_field)
-    print(f"[time_series] doc year data: {time.time()-t1:.3f}s", flush=True)
     # Resolve start/end dates
     start_date = int(request.start_date) if request.start_date else min_date
     end_date = int(request.end_date) if request.end_date else max_date
     # Fire the word query now that we have start/end dates
-    t1 = time.time()
     hits = None
     if request.q:
         metadata = dict(request.metadata)
         metadata[year_field] = "%d-%d" % (start_date, end_date)
         hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **metadata)
-    print(f"[time_series] db.query dispatch: {time.time()-t1:.3f}s", flush=True)
     # Generate date ranges for output
     date_ranges = []
@@ -150,27 +147,18 @@ def generate_time_series(request, config):
     # Absolute hit counts: wait for search, then vectorized bucketing
     if hits is not None:
-        t1 = time.time()
         hits.finish()
-        t_finish = time.time() - t1
         total_hits = len(hits)
-        print(f"[time_series] hits.finish() wait ({total_hits} hits): {t_finish:.3f}s", flush=True)
         if total_hits > 0:
-            t1 = time.time()
             hit_length = hits.length
             mm = np.memmap(hits.filename, dtype="u4", mode="r").reshape(-1, hit_length)
             doc_ids = np.ascontiguousarray(mm[:, 0])
             del mm  # release mmap immediately
-            t_read = time.time() - t1
-            # Single-pass JIT on contiguous doc_id column
-            t1 = time.time()
             bin_counts, total_hits = _bucket_hits_by_year(
                 doc_ids, year_array, start_date, interval, n_ranges
             )
-            t_jit = time.time() - t1
-            print(f"[time_series] mmap+extract doc_ids: {t_read:.3f}s, JIT bucket: {t_jit:.3f}s ({total_hits} hits in {n_ranges} bins)", flush=True)
         else:
             bin_counts = np.zeros(n_ranges, dtype=np.int64)
     else:
@@ -182,7 +170,6 @@ def generate_time_series(request, config):
             total_hits += int(bin_counts[i])
     # Build absolute_count output matching expected format
-    t1 = time.time()
     absolute_count = {}
     for i, (range_start, date_range) in enumerate(date_ranges):
         params = {"report": "concordance", "start": "0", "end": "0"}
@@ -193,7 +180,6 @@ def generate_time_series(request, config):
             "count": int(bin_counts[i]),
             "url": url,
         }
-    print(f"[time_series] build output ({n_ranges} ranges): {time.time()-t1:.3f}s", flush=True)
     time_series_object["results_length"] = int(total_hits)
     time_series_object["more_results"] = False
@@ -202,10 +188,31 @@ def generate_time_series(request, config):
         "date_count": {str(date): count for date, count in date_counts.items()},
     }
-    print(f"[time_series] TOTAL: {time.time()-t0:.3f}s", flush=True)
     return time_series_object
+def time_series_to_csv(results):
+    """Convert time series results to CSV string."""
+    import csv
+    import io
+    absolute_count = results.get("absolute_count", {})
+    date_count = results.get("date_count", {})
+    if not absolute_count:
+        return ""
+    output = io.StringIO()
+    writer = csv.DictWriter(output, fieldnames=["period", "count", "total_words"])
+    writer.writeheader()
+    for period_start in sorted(absolute_count.keys(), key=int):
+        entry = absolute_count[period_start]
+        writer.writerow({
+            "period": entry["label"],
+            "count": entry["count"],
+            "total_words": date_count.get(period_start, ""),
+        })
+    return output.getvalue()
 def get_start_end_date(db, config, start_date=None, end_date=None):
     """Get start and end date of dataset"""
     year_field = validate_column(config.time_series_year_field, db)

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/term_expansion.py RENAMED Viewed

@@ -12,45 +12,13 @@ import regex as re
 from unidecode import unidecode
-# Process-level cache: one LMDB env per lmdb_path, kept open for the
-# lifetime of the worker process (avoids repeated open/close overhead).
-_norm_lmdb_cache: dict[str, lmdb.Environment] = {}
-# db_paths for which word_forms.lmdb is absent (no lemma/attr flat files)
-_no_forms_lmdb: set[str] = set()
 # Flat files (in frequencies/) that feed word_forms.lmdb
 _FORMS_FLAT_FILES = ("lemmas", "word_attributes", "lemma_word_attributes")
-def get_lmdb_env(lmdb_path: str) -> lmdb.Environment:
-    """Return (and cache) a read-only LMDB environment for the given path."""
-    env = _norm_lmdb_cache.get(lmdb_path)
-    if env is not None:
-        return env
-    env = lmdb.open(lmdb_path, readonly=True, lock=False, readahead=False, max_spare_txns=4)
-    _norm_lmdb_cache[lmdb_path] = env
-    return env
-def _get_norm_env(freq_file: str) -> lmdb.Environment:
-    """Return (and cache) the norm_word.lmdb env (built at index time by PostFilters)."""
-    return get_lmdb_env(freq_file + ".lmdb")
-def _get_forms_env(db_path: str) -> lmdb.Environment | None:
-    """Return (and cache) the word_forms.lmdb env (built at index time by PostFilters).
-    Returns None if the database has no word_forms.lmdb (no lemma/attr data).
-    """
-    lmdb_path = os.path.join(db_path, "frequencies", "word_forms.lmdb")
-    if lmdb_path in _norm_lmdb_cache:
-        return _norm_lmdb_cache[lmdb_path]
-    if db_path in _no_forms_lmdb:
-        return None
-    if not os.path.exists(lmdb_path):
-        _no_forms_lmdb.add(db_path)
-        return None
-    return get_lmdb_env(lmdb_path)
+def _open_lmdb(lmdb_path: str) -> lmdb.Environment:
+    """Open a read-only LMDB environment. Caller should close it when done."""
+    return lmdb.open(lmdb_path, readonly=True, lock=False, readahead=False)
 def _norm_key(token: str, lowercase: bool = True) -> bytes:
@@ -258,9 +226,10 @@ def expand_query_not(split, freq_file, dest_fh, ascii_conversion, lowercase=True
     forms, and writes the result to dest_fh.
     Groups are separated by blank lines (consumed by get_word_groups()).
     """
-    env = _get_norm_env(freq_file)
+    env = _open_lmdb(freq_file + ".lmdb")
     db_path = os.path.normpath(os.path.join(os.path.dirname(freq_file), ".."))
-    forms_env = _get_forms_env(db_path)
+    forms_lmdb_path = os.path.join(db_path, "frequencies", "word_forms.lmdb")
+    forms_env = _open_lmdb(forms_lmdb_path) if os.path.exists(forms_lmdb_path) else None
     first = True
     with env.begin(buffers=True) as txn:
@@ -303,6 +272,9 @@ def expand_query_not(split, freq_file, dest_fh, ascii_conversion, lowercase=True
                         dest_fh.write(form + "\n")
                     except TypeError:
                         dest_fh.write((form + "\n").encode("utf-8"))
+    env.close()
+    if forms_env is not None:
+        forms_env.close()
 # ── Metadata inverted word index ──────────────────────────────────────────────
@@ -364,24 +336,22 @@ def build_metadata_word_index(db_path: str) -> int:
     return len(index)
-def _get_metadata_index_env(db_path: str) -> lmdb.Environment:
-    """Return (and cache) the metadata_word_index.lmdb env (built at index time by PostFilters)."""
-    lmdb_path = os.path.join(db_path, "frequencies", _META_LMDB_NAME)
-    return get_lmdb_env(lmdb_path)
 def metadata_word_lookup(db_path: str, field: str, term: str) -> list[str]:
     """Look up metadata values containing term as a whole word.
     Returns list of original metadata values from the inverted word index.
     """
-    env = _get_metadata_index_env(db_path)
-    key = f"{field}\x00{term}".encode("utf-8")
-    with env.begin(buffers=True) as txn:
-        val = txn.get(key)
-        if val is None:
-            return []
-        return bytes(val).decode("utf-8").split("\x00")
+    env = _open_lmdb(os.path.join(db_path, "frequencies", _META_LMDB_NAME))
+    try:
+        key = f"{field}\x00{term}".encode("utf-8")
+        with env.begin(buffers=True) as txn:
+            val = txn.get(key)
+            if val is None:
+                return []
+            return bytes(val).decode("utf-8").split("\x00")
+    finally:
+        env.close()
 def metadata_word_regex_scan(db_path: str, field: str, pattern: str) -> list[str]:
@@ -391,31 +361,34 @@ def metadata_word_regex_scan(db_path: str, field: str, pattern: str) -> list[str
     indexed word.  Returns deduplicated list of original metadata values
     from all matching words.
     """
-    env = _get_metadata_index_env(db_path)
-    field_prefix = f"{field}\x00".encode("utf-8")
-    compiled = re.compile(pattern)
-    seen: set[str] = set()
-    results: list[str] = []
-    with env.begin(buffers=True) as txn:
-        cursor = txn.cursor()
-        try:
-            if not cursor.set_range(field_prefix):
-                return results
-            while True:
-                k = bytes(cursor.key())
-                if not k.startswith(field_prefix):
-                    break
-                word = k[len(field_prefix):].decode("utf-8", errors="replace")
-                if compiled.search(word):
-                    for val in bytes(cursor.value()).decode("utf-8").split("\x00"):
-                        if val not in seen:
-                            seen.add(val)
-                            results.append(val)
-                if not cursor.next():
-                    break
-        finally:
-            cursor.close()
-    return results
+    env = _open_lmdb(os.path.join(db_path, "frequencies", _META_LMDB_NAME))
+    try:
+        field_prefix = f"{field}\x00".encode("utf-8")
+        compiled = re.compile(pattern)
+        seen: set[str] = set()
+        results: list[str] = []
+        with env.begin(buffers=True) as txn:
+            cursor = txn.cursor()
+            try:
+                if not cursor.set_range(field_prefix):
+                    return results
+                while True:
+                    k = bytes(cursor.key())
+                    if not k.startswith(field_prefix):
+                        break
+                    word = k[len(field_prefix):].decode("utf-8", errors="replace")
+                    if compiled.search(word):
+                        for val in bytes(cursor.value()).decode("utf-8").split("\x00"):
+                            if val not in seen:
+                                seen.add(val)
+                                results.append(val)
+                    if not cursor.next():
+                        break
+            finally:
+                cursor.close()
+        return results
+    finally:
+        env.close()
 def metadata_word_prefix_scan(db_path: str, field: str, prefix: str,
@@ -425,30 +398,33 @@ def metadata_word_prefix_scan(db_path: str, field: str, prefix: str,
     Returns deduplicated list of original metadata values from all matching words.
     Used for metadata autocomplete.
     """
-    env = _get_metadata_index_env(db_path)
-    key_prefix = f"{field}\x00{prefix}".encode("utf-8")
-    seen: set[str] = set()
-    results: list[str] = []
-    with env.begin(buffers=True) as txn:
-        cursor = txn.cursor()
-        try:
-            if not cursor.set_range(key_prefix):
-                return results
-            while True:
-                k = bytes(cursor.key())
-                if not k.startswith(key_prefix):
-                    break
-                for val in bytes(cursor.value()).decode("utf-8").split("\x00"):
-                    if val not in seen:
-                        seen.add(val)
-                        results.append(val)
-                        if len(results) >= max_results:
-                            return results
-                if not cursor.next():
-                    break
-        finally:
-            cursor.close()
-    return results
+    env = _open_lmdb(os.path.join(db_path, "frequencies", _META_LMDB_NAME))
+    try:
+        key_prefix = f"{field}\x00{prefix}".encode("utf-8")
+        seen: set[str] = set()
+        results: list[str] = []
+        with env.begin(buffers=True) as txn:
+            cursor = txn.cursor()
+            try:
+                if not cursor.set_range(key_prefix):
+                    return results
+                while True:
+                    k = bytes(cursor.key())
+                    if not k.startswith(key_prefix):
+                        break
+                    for val in bytes(cursor.value()).decode("utf-8").split("\x00"):
+                        if val not in seen:
+                            seen.add(val)
+                            results.append(val)
+                            if len(results) >= max_results:
+                                return results
+                    if not cursor.next():
+                        break
+            finally:
+                cursor.close()
+        return results
+    finally:
+        env.close()
 def expand_autocomplete(kind: str, token: str, frequency_file: str, db_path: str,
@@ -469,29 +445,36 @@ def expand_autocomplete(kind: str, token: str, frequency_file: str, db_path: str
         raw_token = token[1:-1] if kind == "QUOTE" else token
         if not raw_token:
             return []
-        env = _get_norm_env(frequency_file)
-        with env.begin(buffers=True) as txn:
-            if _is_regex_pattern(raw_token):
-                norm_prefix, pattern_str = _normalize_pattern(raw_token, lowercase and ascii_conversion)
-                return _lmdb_expand_term(txn, norm_prefix, pattern_str, max_results)
-            elif ascii_conversion:
-                norm_prefix = _norm_key(raw_token, lowercase)
-                return _lmdb_expand_term(txn, norm_prefix, None, max_results)
-            else:
-                # ascii_conversion=False: query token is the norm key as-is
-                norm_prefix = raw_token.lower().encode("utf-8") if lowercase else raw_token.encode("utf-8")
-                return _lmdb_expand_term(txn, norm_prefix, None, max_results)
+        env = _open_lmdb(frequency_file + ".lmdb")
+        try:
+            with env.begin(buffers=True) as txn:
+                if _is_regex_pattern(raw_token):
+                    norm_prefix, pattern_str = _normalize_pattern(raw_token, lowercase and ascii_conversion)
+                    return _lmdb_expand_term(txn, norm_prefix, pattern_str, max_results)
+                elif ascii_conversion:
+                    norm_prefix = _norm_key(raw_token, lowercase)
+                    return _lmdb_expand_term(txn, norm_prefix, None, max_results)
+                else:
+                    # ascii_conversion=False: query token is the norm key as-is
+                    norm_prefix = raw_token.lower().encode("utf-8") if lowercase else raw_token.encode("utf-8")
+                    return _lmdb_expand_term(txn, norm_prefix, None, max_results)
+        finally:
+            env.close()
     elif kind in ("LEMMA", "ATTR", "LEMMA_ATTR"):
         if not token:
             return []
-        scan_env = _get_forms_env(db_path) or get_lmdb_env(os.path.join(db_path, "words.lmdb"))
-        with scan_env.begin(buffers=True) as txn:
-            if _is_regex_pattern(token):
-                literal, meta = _split_literal_prefix(token)
-                prefix_bytes = literal.encode("utf-8")
-                return _lmdb_scan_keys(txn, prefix_bytes, literal + meta, max_results)
-            else:
-                return _lmdb_scan_keys(txn, token.encode("utf-8"), None, max_results)
+        forms_lmdb_path = os.path.join(db_path, "frequencies", "word_forms.lmdb")
+        scan_env = _open_lmdb(forms_lmdb_path) if os.path.exists(forms_lmdb_path) else _open_lmdb(os.path.join(db_path, "words.lmdb"))
+        try:
+            with scan_env.begin(buffers=True) as txn:
+                if _is_regex_pattern(token):
+                    literal, meta = _split_literal_prefix(token)
+                    prefix_bytes = literal.encode("utf-8")
+                    return _lmdb_scan_keys(txn, prefix_bytes, literal + meta, max_results)
+                else:
+                    return _lmdb_scan_keys(txn, token.encode("utf-8"), None, max_results)
+        finally:
+            scan_env.close()
     return []

philologic-5.2.2/philologic/utils/upgrade_gunicorn_conf.py ADDED Viewed

@@ -0,0 +1,138 @@
+"""Upgrade gunicorn.conf.py while preserving user customizations.
+Compares the old installed gunicorn.conf.py against the old shipped defaults
+(gunicorn.conf.defaults.py) to detect user customizations, then replaces
+the corresponding lines in the new version in place.
+Settings that the user never changed get the new defaults automatically.
+Settings the user explicitly changed are preserved at their original location.
+"""
+import ast
+import re
+# Whitelist of setting names whose values may be carried across an upgrade.
+# Everything else (hooks, imports, computed values) always comes from the
+# new version of the file.  Kept in alphabetical order.
+MERGEABLE_SETTINGS = {
+    "accesslog", "bind", "capture_output", "errorlog", "loglevel",
+    "max_requests", "max_requests_jitter", "preload_app", "proc_name",
+    "timeout", "workers",
+}
+def _load_conf_values(path):
+    """Extract literal top-level assignments for mergeable settings via AST.
+    The file is parsed, never executed.  Only plain single-target assignments
+    (``name = value``) at module level are considered, and only for names
+    listed in MERGEABLE_SETTINGS.  The right-hand side must be something
+    ``ast.literal_eval`` accepts (strings, bytes, numbers, booleans, None and
+    tuples/lists/dicts/sets of those); anything else, such as
+    ``min(cpu_count(), 4)``, is skipped.
+    Args:
+        path: path of the Python config file to inspect.
+    Returns:
+        Dict mapping setting name to its literal value.  When a name is
+        assigned more than once, the last literal assignment wins.
+    Raises:
+        OSError: if the file cannot be read.
+        SyntaxError: if the file is not valid Python.
+    """
+    # Read raw bytes and let the parser decode them: that honours a PEP 263
+    # coding declaration and the UTF-8 source default, instead of depending
+    # on the process locale as a text-mode open() would.
+    with open(path, "rb") as f:
+        tree = ast.parse(f.read(), filename=path)
+    values = {}
+    for node in tree.body:
+        if not isinstance(node, ast.Assign):
+            continue
+        # Skip chained (a = b = 1) and non-name (obj.attr = 1, a, b = ...) targets.
+        if len(node.targets) != 1 or not isinstance(node.targets[0], ast.Name):
+            continue
+        name = node.targets[0].id
+        if name not in MERGEABLE_SETTINGS:
+            continue
+        try:
+            values[name] = ast.literal_eval(node.value)
+        except (ValueError, TypeError):
+            # Not a literal (e.g. min(cpu_count(), 4)) — skip, can't merge
+            pass
+    return values
+def _load_conf_names(path):
+    """Collect the names of all plain top-level assignments in a config file.
+    Unlike a literal-value extraction, this reports a name even when its
+    right-hand side is not a literal (e.g. ``min(cpu_count(), 4)``), so it can
+    be used to check which settings exist in the file.  The file is parsed,
+    never executed.  Chained assignments (``a = b = 1``) and non-name targets
+    (``obj.attr = 1``, tuple unpacking) are ignored, as are assignments nested
+    inside functions or other blocks.
+    Args:
+        path: path of the Python config file to inspect.
+    Returns:
+        Set of assigned names.
+    Raises:
+        OSError: if the file cannot be read.
+        SyntaxError: if the file is not valid Python.
+    """
+    # Read raw bytes and let the parser decode them: that honours a PEP 263
+    # coding declaration and the UTF-8 source default, instead of depending
+    # on the process locale as a text-mode open() would.
+    with open(path, "rb") as f:
+        tree = ast.parse(f.read(), filename=path)
+    return {
+        node.targets[0].id
+        for node in tree.body
+        if isinstance(node, ast.Assign)
+        and len(node.targets) == 1
+        and isinstance(node.targets[0], ast.Name)
+    }
+def _replace_setting_in_file(filepath, name, value):
+    """Rewrite every top-level ``name = ...`` line of a config file in place.
+    A line matches only when ``name`` starts in column 0, so commented-out or
+    indented assignments are left alone.  The text up to and including the
+    ``=`` is kept; the rest of the line (including any trailing comment) is
+    replaced by ``repr(value)``.  Only the matched line is rewritten, so a
+    value that continues over several lines in the target file is not handled.
+    Args:
+        filepath: path of the Python config file to modify.
+        name: setting name to look for.
+        value: new value; written as ``repr(value)``.
+    Returns:
+        True if at least one line was replaced (the file is rewritten),
+        False if no matching line exists (the file is left untouched).
+    """
+    with open(filepath) as f:
+        content = f.read()
+    # Match the assignment line: `name = <anything>` (not inside a comment)
+    pattern = re.compile(rf'^({re.escape(name)}\s*=\s*).*$', re.MULTILINE)
+    new_literal = repr(value)
+    # Use a callable replacement rather than a template string: re would
+    # reinterpret backslashes in a template (e.g. in the repr of a Windows
+    # path), silently corrupting the value or raising "bad escape".
+    new_content, count = pattern.subn(lambda m: m.group(1) + new_literal, content)
+    if count == 0:
+        return False
+    with open(filepath, "w") as f:
+        f.write(new_content)
+    return True
+def upgrade_gunicorn_conf(old_conf, old_defaults, new_conf, new_defaults=None):
+    """Upgrade gunicorn.conf.py preserving user customizations.
+    The new conf and defaults files should already be in their final location
+    (e.g. copied by install.sh). This function reads the OLD backups to detect
+    customizations, then replaces the corresponding values in the new conf.
+    A setting counts as customized when it has a literal value in the old conf
+    and that value either differs from the literal in the old defaults or has
+    no literal counterpart there.  Only names in MERGEABLE_SETTINGS are ever
+    considered.
+    Args:
+        old_conf:      path to backup of the previously installed gunicorn.conf.py
+        old_defaults:  path to backup of the previously installed gunicorn.conf.defaults.py
+        new_conf:      path to the new gunicorn.conf.py (already in place)
+        new_defaults:  path to the new gunicorn.conf.defaults.py (already in place);
+                       when omitted, customizations are not filtered against it
+    Returns:
+        List of setting names that were preserved from the old config, in
+        alphabetical order.
+    """
+    # Load values from the old files
+    prev_defaults = _load_conf_values(old_defaults)
+    prev_conf = _load_conf_values(old_conf)
+    # Names assigned in the new defaults; an empty set disables the filter below.
+    new_default_names = _load_conf_names(new_defaults) if new_defaults else set()
+    preserved = []
+    # Iterate in sorted order so the returned list is deterministic: plain set
+    # iteration order for strings varies between interpreter runs.
+    for key in sorted(MERGEABLE_SETTINGS):
+        if key not in prev_conf:
+            continue
+        # Unchanged from the old default: let the new default apply.
+        if key in prev_defaults and prev_conf[key] == prev_defaults[key]:
+            continue
+        # Settings no longer assigned in the new defaults are intentionally
+        # dropped, even if the user had customized them.
+        if new_default_names and key not in new_default_names:
+            continue
+        if _replace_setting_in_file(new_conf, key, prev_conf[key]):
+            preserved.append(key)
+    return preserved

{philologic-5.2.0.2 → philologic-5.2.2}/philologic.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: philologic
-Version: 5.2.0.2
+Version: 5.2.2
 Summary: A concordance search engine for TEI-XML
 Author-email: Clovis Gladstone <clovisgladstone@artfl.uchicago.edu>
 License-Expression: GPL-3.0-or-later

{philologic-5.2.0.2 → philologic-5.2.2}/philologic.egg-info/SOURCES.txt RENAMED Viewed

@@ -58,4 +58,5 @@ philologic/utils/line_count.py
 philologic/utils/load_module.py
 philologic/utils/metadata_type_handler.py
 philologic/utils/pretty_print.py
-philologic/utils/sort.py
+philologic/utils/sort.py
+philologic/utils/upgrade_gunicorn_conf.py

{philologic-5.2.0.2 → philologic-5.2.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "philologic"
-version = "5.2.0.2"
+version = "5.2.2"
 authors = [
     { name = "Clovis Gladstone", email = "clovisgladstone@artfl.uchicago.edu" },
 ]

{philologic-5.2.0.2 → philologic-5.2.2}/LICENSE RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/TagCensus.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/__init__.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/LoadFilters.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/LoadOptions.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/Loader.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/OHCOVector.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/Parser.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/PlainTextParser.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/PostFilters.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/__init__.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/__main__.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/FragmentParser.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/HitList.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/HitWrapper.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/ObjectFormatter.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/WSGIHandler.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/citations.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/exceptions.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/find_similar_words.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/get_text.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/link.py RENAMED Viewed

File without changes

{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/multi_word_search.py RENAMED Viewed

@@ -8,7 +8,6 @@ proxy, exact_cooc, sentence).
 import os
-import lmdb
 import numba
 import numpy as np
@@ -20,6 +19,8 @@ if not os.access(cache_dir, os.W_OK):
 os.environ["NUMBA_CACHE_DIR"] = cache_dir
 numba.config.CACHE_DIR = cache_dir
+import lmdb
 from philologic.runtime.Query import (
     _find_doc_boundaries,
     _load_word_arrays,
@@ -901,7 +902,6 @@ def search_phrase(db_path, hitlist_filename, overflow_words, corpus_file=None):
                         if not flushed:
                             output_file.flush()
                             flushed = True
     env.close()
@@ -1208,5 +1208,5 @@ def _search_two_groups_batched(db_path, hitlist_filename, word_groups, overflow_
                             if not flushed:
                                 output_file.flush()
                                 flushed = True
     env.close()