philologic 5.2.0.2__tar.gz → 5.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {philologic-5.2.0.2 → philologic-5.2.2}/PKG-INFO +1 -1
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/Config.py +5 -1
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/DB.py +1 -1
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/MetadataQuery.py +1 -1
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/Query.py +12 -11
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/QuerySyntax.py +4 -2
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/__init__.py +9 -3
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/access_control.py +4 -4
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/__init__.py +6 -6
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/aggregation.py +42 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/bibliography.py +18 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/collocation.py +15 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/concordance.py +23 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/kwic.py +22 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/time_series.py +26 -19
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/term_expansion.py +101 -118
- philologic-5.2.2/philologic/utils/upgrade_gunicorn_conf.py +138 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic.egg-info/PKG-INFO +1 -1
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic.egg-info/SOURCES.txt +2 -1
- {philologic-5.2.0.2 → philologic-5.2.2}/pyproject.toml +1 -1
- {philologic-5.2.0.2 → philologic-5.2.2}/LICENSE +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/TagCensus.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/__init__.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/LoadFilters.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/LoadOptions.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/Loader.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/OHCOVector.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/Parser.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/PlainTextParser.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/PostFilters.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/__init__.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/loadtime/__main__.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/FragmentParser.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/HitList.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/HitWrapper.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/ObjectFormatter.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/WSGIHandler.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/citations.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/exceptions.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/find_similar_words.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/get_text.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/link.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/multi_word_search.py +3 -3
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/pages.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/frequency.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/generate_word_frequency.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/landing_page.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/navigation.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/table_of_contents.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/sql_validation.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/web_config.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/shlax.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/shlaxtree.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/utils/__init__.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/utils/convert_entities.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/utils/line_count.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/utils/load_module.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/utils/metadata_type_handler.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/utils/pretty_print.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic/utils/sort.py +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic.egg-info/dependency_links.txt +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic.egg-info/requires.txt +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/philologic.egg-info/top_level.txt +0 -0
- {philologic-5.2.0.2 → philologic-5.2.2}/setup.cfg +0 -0
|
@@ -164,7 +164,11 @@ DB_LOCALS_DEFAULTS = {
|
|
|
164
164
|
"overflow_words": {
|
|
165
165
|
"value": set(),
|
|
166
166
|
"comment": "# The overflow_words variable is a set of words which are not indexed in the database, but stored as blobs in the data/overflow_words directory.",
|
|
167
|
-
}
|
|
167
|
+
},
|
|
168
|
+
"query_patterns": {
|
|
169
|
+
"value": None,
|
|
170
|
+
"comment": "# Custom query tokenization patterns. When set, overrides the default patterns in QuerySyntax.parse_query.\n# Must be a list of (label, regex) tuples, e.g. [(\"TERM\", r'[^\\s\"]+'), ...].\n# When None, the built-in default patterns are used.",
|
|
171
|
+
},
|
|
168
172
|
}
|
|
169
173
|
DB_LOCALS_HEADER = """
|
|
170
174
|
#########################################################\n
|
|
@@ -269,7 +269,7 @@ class DB:
|
|
|
269
269
|
raw_bytes=raw_bytes,
|
|
270
270
|
ascii_conversion=self.locals.ascii_conversion,
|
|
271
271
|
)
|
|
272
|
-
parsed = QuerySyntax.parse_query(qs)
|
|
272
|
+
parsed = QuerySyntax.parse_query(qs, query_patterns=self.locals.query_patterns)
|
|
273
273
|
grouped = QuerySyntax.group_terms(parsed)
|
|
274
274
|
split = Query.split_terms(grouped)
|
|
275
275
|
words_per_hit = len(split)
|
|
@@ -200,7 +200,7 @@ def query_lowlevel(db, param_dict, sort_order, ascii_conversion):
|
|
|
200
200
|
for v in values:
|
|
201
201
|
parsed = "text"
|
|
202
202
|
if db.locals.metadata_sql_types[column] in ("text", "int"):
|
|
203
|
-
parsed = parse_query(v)
|
|
203
|
+
parsed = parse_query(v, query_patterns=db.locals.query_patterns)
|
|
204
204
|
elif db.locals.metadata_sql_types[column] == "date":
|
|
205
205
|
v = v.replace('"', "") # remove quotes
|
|
206
206
|
parsed = parse_date_query(v)
|
|
@@ -7,24 +7,25 @@ import threading
|
|
|
7
7
|
from bisect import bisect_left, bisect_right
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
|
|
10
|
+
# Set Numba cache directory BEFORE importing numba — otherwise Numba resolves
|
|
11
|
+
# its cache locator using the default (write next to source file), which fails
|
|
12
|
+
# when the source is in a read-only site-packages directory.
|
|
13
|
+
_cache_dir = os.environ.get("NUMBA_CACHE_DIR", "/var/lib/philologic5/numba_cache")
|
|
14
|
+
if not os.access(_cache_dir, os.W_OK):
|
|
15
|
+
_cache_dir = f"/tmp/philologic_numba_cache_{os.getuid()}"
|
|
16
|
+
os.makedirs(_cache_dir, mode=0o755, exist_ok=True)
|
|
17
|
+
os.environ["NUMBA_CACHE_DIR"] = _cache_dir
|
|
18
|
+
|
|
10
19
|
import lmdb
|
|
11
20
|
import numba
|
|
12
21
|
import numpy as np
|
|
13
22
|
import regex as re
|
|
14
23
|
|
|
24
|
+
numba.config.CACHE_DIR = _cache_dir
|
|
25
|
+
|
|
15
26
|
from philologic.runtime import HitList
|
|
16
27
|
from philologic.runtime.QuerySyntax import group_terms, parse_query
|
|
17
28
|
|
|
18
|
-
# Set Numba cache directory
|
|
19
|
-
# Try shared cache first, fall back to /tmp if permission denied
|
|
20
|
-
cache_dir = "/var/lib/philologic5/numba_cache"
|
|
21
|
-
if not os.access(cache_dir, os.W_OK):
|
|
22
|
-
# In hardened containers, use per-user temp cache
|
|
23
|
-
cache_dir = f"/tmp/philologic_numba_cache_{os.getuid()}"
|
|
24
|
-
os.makedirs(cache_dir, mode=0o755, exist_ok=True)
|
|
25
|
-
os.environ["NUMBA_CACHE_DIR"] = cache_dir
|
|
26
|
-
numba.config.CACHE_DIR = cache_dir
|
|
27
|
-
|
|
28
29
|
|
|
29
30
|
@numba.jit(nopython=True, cache=True, nogil=True)
|
|
30
31
|
def _merge_two_sorted_arrays(arr1, arr2):
|
|
@@ -455,7 +456,7 @@ def query(
|
|
|
455
456
|
):
|
|
456
457
|
"""Runs concordance queries"""
|
|
457
458
|
sys.stdout.flush()
|
|
458
|
-
parsed = parse_query(terms)
|
|
459
|
+
parsed = parse_query(terms, query_patterns=db.locals.query_patterns)
|
|
459
460
|
grouped = group_terms(parsed)
|
|
460
461
|
split = split_terms(grouped)
|
|
461
462
|
words_per_hit = len(split)
|
|
@@ -31,12 +31,14 @@ date_patterns = [
|
|
|
31
31
|
]
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
def parse_query(qstring):
|
|
34
|
+
def parse_query(qstring, query_patterns=None):
|
|
35
35
|
"""Parse query"""
|
|
36
|
+
if query_patterns is None:
|
|
37
|
+
query_patterns = patterns
|
|
36
38
|
buf = qstring[:]
|
|
37
39
|
parsed = []
|
|
38
40
|
while len(buf) > 0:
|
|
39
|
-
for label, pattern in patterns:
|
|
41
|
+
for label, pattern in query_patterns:
|
|
40
42
|
m = re.match(pattern, buf)
|
|
41
43
|
if m:
|
|
42
44
|
parsed.append((label, m.group()))
|
|
@@ -7,21 +7,27 @@ from philologic.runtime.get_text import get_concordance_text, get_tei_header
|
|
|
7
7
|
from philologic.runtime.pages import page_interval
|
|
8
8
|
from philologic.runtime.Query import parse_query
|
|
9
9
|
from philologic.runtime.reports import (
|
|
10
|
+
aggregation_by_field,
|
|
11
|
+
aggregation_to_csv,
|
|
10
12
|
bibliography_results,
|
|
13
|
+
bibliography_to_csv,
|
|
11
14
|
collocation_results,
|
|
15
|
+
collocation_to_csv,
|
|
12
16
|
concordance_results,
|
|
17
|
+
concordance_to_csv,
|
|
13
18
|
frequency_results,
|
|
14
19
|
generate_text_object,
|
|
15
20
|
generate_time_series,
|
|
16
21
|
generate_toc_object,
|
|
17
22
|
generate_word_frequency,
|
|
18
23
|
get_start_end_date,
|
|
19
|
-
kwic_hit_object,
|
|
20
|
-
kwic_results,
|
|
21
24
|
group_by_metadata,
|
|
22
25
|
group_by_range,
|
|
26
|
+
kwic_hit_object,
|
|
27
|
+
kwic_results,
|
|
28
|
+
kwic_to_csv,
|
|
23
29
|
landing_page_bibliography,
|
|
24
|
-
|
|
30
|
+
time_series_to_csv,
|
|
25
31
|
)
|
|
26
32
|
from philologic.runtime.web_config import WebConfig
|
|
27
33
|
from philologic.runtime.WSGIHandler import WSGIHandler
|
|
@@ -309,15 +309,15 @@ def login_access(environ, request, config, headers):
|
|
|
309
309
|
token = make_token(db)
|
|
310
310
|
if token:
|
|
311
311
|
h, ts = token
|
|
312
|
-
headers.append(("Set-Cookie", f"hash={h}"))
|
|
313
|
-
headers.append(("Set-Cookie", f"timestamp={ts}"))
|
|
312
|
+
headers.append(("Set-Cookie", f"hash={h}; Path=/"))
|
|
313
|
+
headers.append(("Set-Cookie", f"timestamp={ts}; Path=/"))
|
|
314
314
|
else:
|
|
315
315
|
# WORKAROUND because cookie not being sent on access_request.py request
|
|
316
316
|
token = check_access(environ, config)
|
|
317
317
|
if token:
|
|
318
318
|
h, ts = token
|
|
319
|
-
headers.append(("Set-Cookie", f"hash={h}"))
|
|
320
|
-
headers.append(("Set-Cookie", f"timestamp={ts}"))
|
|
319
|
+
headers.append(("Set-Cookie", f"hash={h}; Path=/"))
|
|
320
|
+
headers.append(("Set-Cookie", f"timestamp={ts}; Path=/"))
|
|
321
321
|
access = True
|
|
322
322
|
else:
|
|
323
323
|
access = False
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
"""Report exports"""
|
|
2
2
|
|
|
3
|
-
from philologic.runtime.reports.concordance import concordance_results
|
|
4
|
-
from philologic.runtime.reports.bibliography import bibliography_results
|
|
5
|
-
from philologic.runtime.reports.time_series import generate_time_series, get_start_end_date
|
|
3
|
+
from philologic.runtime.reports.concordance import concordance_results, concordance_to_csv
|
|
4
|
+
from philologic.runtime.reports.bibliography import bibliography_results, bibliography_to_csv
|
|
5
|
+
from philologic.runtime.reports.time_series import generate_time_series, get_start_end_date, time_series_to_csv
|
|
6
6
|
from philologic.runtime.reports.navigation import generate_text_object
|
|
7
7
|
from philologic.runtime.reports.table_of_contents import generate_toc_object
|
|
8
|
-
from philologic.runtime.reports.kwic import kwic_results, kwic_hit_object
|
|
8
|
+
from philologic.runtime.reports.kwic import kwic_results, kwic_hit_object, kwic_to_csv
|
|
9
9
|
from philologic.runtime.reports.generate_word_frequency import generate_word_frequency
|
|
10
10
|
from philologic.runtime.reports.frequency import frequency_results
|
|
11
|
-
from philologic.runtime.reports.collocation import collocation_results
|
|
11
|
+
from philologic.runtime.reports.collocation import collocation_results, collocation_to_csv
|
|
12
12
|
from philologic.runtime.reports.landing_page import landing_page_bibliography, group_by_range, group_by_metadata
|
|
13
|
-
from philologic.runtime.reports.aggregation import aggregation_by_field
|
|
13
|
+
from philologic.runtime.reports.aggregation import aggregation_by_field, aggregation_to_csv
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
# /usr/bin/env python3
|
|
2
2
|
"""Report designed to group results by metadata with additional breakdown optional"""
|
|
3
3
|
|
|
4
|
+
import csv
|
|
5
|
+
import io
|
|
6
|
+
|
|
4
7
|
import numpy as np
|
|
5
8
|
|
|
6
9
|
from philologic.runtime.DB import DB
|
|
@@ -137,6 +140,45 @@ def aggregation_by_field(request, config):
|
|
|
137
140
|
}
|
|
138
141
|
|
|
139
142
|
|
|
143
|
+
def aggregation_to_csv(results, break_up_field_name=""):
|
|
144
|
+
"""Convert aggregation results to CSV string.
|
|
145
|
+
|
|
146
|
+
Each breakdown entry gets its own row. Rows from the same group
|
|
147
|
+
are contiguous, with the group-level metadata repeated.
|
|
148
|
+
"""
|
|
149
|
+
if not results:
|
|
150
|
+
return ""
|
|
151
|
+
output = io.StringIO()
|
|
152
|
+
first = results[0]
|
|
153
|
+
group_keys = sorted(k for k in first["metadata_fields"].keys() if k not in ("field_name", "philo_id"))
|
|
154
|
+
has_breakdown = break_up_field_name and any(r["break_up_field"] for r in results)
|
|
155
|
+
if has_breakdown:
|
|
156
|
+
# Collect all metadata keys from breakdown entries
|
|
157
|
+
breakdown_keys = set()
|
|
158
|
+
for result in results:
|
|
159
|
+
for sub in result["break_up_field"]:
|
|
160
|
+
breakdown_keys.update(k for k in sub["metadata_fields"].keys() if k not in ("field_name", "philo_id"))
|
|
161
|
+
breakdown_keys = sorted(breakdown_keys - set(group_keys))
|
|
162
|
+
fieldnames = group_keys + ["group_count"] + breakdown_keys + ["count"]
|
|
163
|
+
else:
|
|
164
|
+
fieldnames = group_keys + ["count"]
|
|
165
|
+
writer = csv.DictWriter(output, fieldnames=fieldnames)
|
|
166
|
+
writer.writeheader()
|
|
167
|
+
for result in results:
|
|
168
|
+
group_fields = {k: result["metadata_fields"].get(k, "") for k in group_keys}
|
|
169
|
+
if has_breakdown and result["break_up_field"]:
|
|
170
|
+
for sub in result["break_up_field"]:
|
|
171
|
+
row = {**group_fields, "group_count": result["count"]}
|
|
172
|
+
for k in breakdown_keys:
|
|
173
|
+
row[k] = sub["metadata_fields"].get(k, "")
|
|
174
|
+
row["count"] = sub["count"]
|
|
175
|
+
writer.writerow(row)
|
|
176
|
+
else:
|
|
177
|
+
row = {**group_fields, "count": result["count"]}
|
|
178
|
+
writer.writerow(row)
|
|
179
|
+
return output.getvalue()
|
|
180
|
+
|
|
181
|
+
|
|
140
182
|
def __expand_hits_counted(hits, metadata_type):
|
|
141
183
|
"""Stream sorted hitlist with numpy, return per-ID hit counts.
|
|
142
184
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
#!/var/lib/philologic5/philologic_env/bin/python3
|
|
2
2
|
"""Bibliography results"""
|
|
3
3
|
|
|
4
|
+
import csv
|
|
5
|
+
import io
|
|
4
6
|
|
|
5
7
|
from philologic.runtime.citations import citation_links, citations
|
|
6
8
|
from philologic.runtime.DB import DB
|
|
@@ -77,3 +79,19 @@ def bibliography_results(request, config):
|
|
|
77
79
|
bibliography_object["query_done"] = hits.done
|
|
78
80
|
bibliography_object["result_type"] = result_type
|
|
79
81
|
return bibliography_object, hits
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def bibliography_to_csv(results):
|
|
85
|
+
"""Convert bibliography results to CSV string."""
|
|
86
|
+
if not results:
|
|
87
|
+
return ""
|
|
88
|
+
output = io.StringIO()
|
|
89
|
+
metadata_keys = sorted(results[0]["metadata_fields"].keys())
|
|
90
|
+
fieldnames = ["philo_id"] + metadata_keys
|
|
91
|
+
writer = csv.DictWriter(output, fieldnames=fieldnames)
|
|
92
|
+
writer.writeheader()
|
|
93
|
+
for result in results:
|
|
94
|
+
row = {"philo_id": " ".join(str(x) for x in result["philo_id"])}
|
|
95
|
+
row.update(result["metadata_fields"])
|
|
96
|
+
writer.writerow(row)
|
|
97
|
+
return output.getvalue()
|
|
@@ -580,6 +580,21 @@ def collocation_results(request, config):
|
|
|
580
580
|
return collocation_object
|
|
581
581
|
|
|
582
582
|
|
|
583
|
+
def collocation_to_csv(collocates):
|
|
584
|
+
"""Convert collocation results (list of (word, count) tuples) to CSV string."""
|
|
585
|
+
import csv
|
|
586
|
+
import io
|
|
587
|
+
|
|
588
|
+
if not collocates:
|
|
589
|
+
return ""
|
|
590
|
+
output = io.StringIO()
|
|
591
|
+
writer = csv.DictWriter(output, fieldnames=["collocate", "count"])
|
|
592
|
+
writer.writeheader()
|
|
593
|
+
for word, count in collocates:
|
|
594
|
+
writer.writerow({"collocate": word, "count": count})
|
|
595
|
+
return output.getvalue()
|
|
596
|
+
|
|
597
|
+
|
|
583
598
|
def atomic_pickle_dump(data, file_path):
|
|
584
599
|
"""Write pickle atomically to prevent truncated reads from concurrent requests."""
|
|
585
600
|
dir_path = os.path.dirname(file_path)
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
#!/var/lib/philologic5/philologic_env/bin/python3
|
|
2
2
|
"""Concordance report"""
|
|
3
3
|
|
|
4
|
+
import csv
|
|
5
|
+
import io
|
|
6
|
+
|
|
4
7
|
import regex as re
|
|
5
8
|
from philologic.runtime.citations import citation_links, citations
|
|
6
9
|
from philologic.runtime.DB import DB
|
|
@@ -60,3 +63,23 @@ def concordance_results(request, config):
|
|
|
60
63
|
concordance_object["results_length"] = len(hits)
|
|
61
64
|
concordance_object["query_done"] = hits.done
|
|
62
65
|
return concordance_object
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def concordance_to_csv(results, filter_html=False):
|
|
69
|
+
"""Convert concordance results to CSV string."""
|
|
70
|
+
if not results:
|
|
71
|
+
return ""
|
|
72
|
+
tags_re = re.compile(r"<[^>]+>")
|
|
73
|
+
output = io.StringIO()
|
|
74
|
+
metadata_keys = sorted(results[0]["metadata_fields"].keys())
|
|
75
|
+
fieldnames = ["philo_id", "context"] + metadata_keys
|
|
76
|
+
writer = csv.DictWriter(output, fieldnames=fieldnames)
|
|
77
|
+
writer.writeheader()
|
|
78
|
+
for result in results:
|
|
79
|
+
context = result["context"]
|
|
80
|
+
if filter_html:
|
|
81
|
+
context = tags_re.sub("", context).strip()
|
|
82
|
+
row = {"philo_id": " ".join(str(x) for x in result["philo_id"]), "context": context}
|
|
83
|
+
row.update(result["metadata_fields"])
|
|
84
|
+
writer.writerow(row)
|
|
85
|
+
return output.getvalue()
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
#!/var/lib/philologic5/philologic_env/bin/python3
|
|
2
2
|
"""KWIC results"""
|
|
3
3
|
|
|
4
|
+
import csv
|
|
5
|
+
import io
|
|
4
6
|
|
|
5
7
|
import regex as re
|
|
6
8
|
from philologic.runtime.citations import citation_links, citations
|
|
@@ -92,3 +94,23 @@ def kwic_hit_object(hit, config, db):
|
|
|
92
94
|
}
|
|
93
95
|
|
|
94
96
|
return kwic_result
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def kwic_to_csv(results, filter_html=False):
|
|
100
|
+
"""Convert KWIC results to CSV string."""
|
|
101
|
+
if not results:
|
|
102
|
+
return ""
|
|
103
|
+
tags_re = re.compile(r"<[^>]+>")
|
|
104
|
+
output = io.StringIO()
|
|
105
|
+
metadata_keys = sorted(results[0]["metadata_fields"].keys())
|
|
106
|
+
fieldnames = ["philo_id", "context"] + metadata_keys
|
|
107
|
+
writer = csv.DictWriter(output, fieldnames=fieldnames)
|
|
108
|
+
writer.writeheader()
|
|
109
|
+
for result in results:
|
|
110
|
+
context = result["context"]
|
|
111
|
+
if filter_html:
|
|
112
|
+
context = tags_re.sub("", context).strip()
|
|
113
|
+
row = {"philo_id": " ".join(str(x) for x in result["philo_id"]), "context": context}
|
|
114
|
+
row.update(result["metadata_fields"])
|
|
115
|
+
writer.writerow(row)
|
|
116
|
+
return output.getvalue()
|
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
"""Time series"""
|
|
3
3
|
|
|
4
4
|
import os
|
|
5
|
-
import time
|
|
6
5
|
|
|
7
6
|
import numba
|
|
8
7
|
import numpy as np
|
|
@@ -96,7 +95,6 @@ def _bucket_hits_by_year(doc_ids, year_array, start_date, interval, n_ranges):
|
|
|
96
95
|
|
|
97
96
|
|
|
98
97
|
def generate_time_series(request, config):
|
|
99
|
-
t0 = time.time()
|
|
100
98
|
db = DB(config.db_path + "/data/")
|
|
101
99
|
year_field = validate_column(config.time_series_year_field, db)
|
|
102
100
|
time_series_object = {"query": dict([i for i in request]), "query_done": False}
|
|
@@ -109,25 +107,24 @@ def generate_time_series(request, config):
|
|
|
109
107
|
time_series_object["results"] = {"absolute_count": {}, "date_count": {}}
|
|
110
108
|
return time_series_object
|
|
111
109
|
|
|
112
|
-
|
|
110
|
+
try:
|
|
111
|
+
interval = int(request.year_interval)
|
|
112
|
+
except (ValueError, TypeError):
|
|
113
|
+
interval = int(config.time_series_interval)
|
|
113
114
|
|
|
114
115
|
# Get cached doc→year mapping (SQL only on first request per worker)
|
|
115
|
-
t1 = time.time()
|
|
116
116
|
year_array, year_word_counts, year_doc_counts, min_date, max_date = _get_doc_year_data(db, year_field)
|
|
117
|
-
print(f"[time_series] doc year data: {time.time()-t1:.3f}s", flush=True)
|
|
118
117
|
|
|
119
118
|
# Resolve start/end dates
|
|
120
119
|
start_date = int(request.start_date) if request.start_date else min_date
|
|
121
120
|
end_date = int(request.end_date) if request.end_date else max_date
|
|
122
121
|
|
|
123
122
|
# Fire the word query now that we have start/end dates
|
|
124
|
-
t1 = time.time()
|
|
125
123
|
hits = None
|
|
126
124
|
if request.q:
|
|
127
125
|
metadata = dict(request.metadata)
|
|
128
126
|
metadata[year_field] = "%d-%d" % (start_date, end_date)
|
|
129
127
|
hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **metadata)
|
|
130
|
-
print(f"[time_series] db.query dispatch: {time.time()-t1:.3f}s", flush=True)
|
|
131
128
|
|
|
132
129
|
# Generate date ranges for output
|
|
133
130
|
date_ranges = []
|
|
@@ -150,27 +147,18 @@ def generate_time_series(request, config):
|
|
|
150
147
|
|
|
151
148
|
# Absolute hit counts: wait for search, then vectorized bucketing
|
|
152
149
|
if hits is not None:
|
|
153
|
-
t1 = time.time()
|
|
154
150
|
hits.finish()
|
|
155
|
-
t_finish = time.time() - t1
|
|
156
151
|
total_hits = len(hits)
|
|
157
|
-
print(f"[time_series] hits.finish() wait ({total_hits} hits): {t_finish:.3f}s", flush=True)
|
|
158
152
|
|
|
159
153
|
if total_hits > 0:
|
|
160
|
-
t1 = time.time()
|
|
161
154
|
hit_length = hits.length
|
|
162
155
|
mm = np.memmap(hits.filename, dtype="u4", mode="r").reshape(-1, hit_length)
|
|
163
156
|
doc_ids = np.ascontiguousarray(mm[:, 0])
|
|
164
157
|
del mm # release mmap immediately
|
|
165
|
-
t_read = time.time() - t1
|
|
166
158
|
|
|
167
|
-
# Single-pass JIT on contiguous doc_id column
|
|
168
|
-
t1 = time.time()
|
|
169
159
|
bin_counts, total_hits = _bucket_hits_by_year(
|
|
170
160
|
doc_ids, year_array, start_date, interval, n_ranges
|
|
171
161
|
)
|
|
172
|
-
t_jit = time.time() - t1
|
|
173
|
-
print(f"[time_series] mmap+extract doc_ids: {t_read:.3f}s, JIT bucket: {t_jit:.3f}s ({total_hits} hits in {n_ranges} bins)", flush=True)
|
|
174
162
|
else:
|
|
175
163
|
bin_counts = np.zeros(n_ranges, dtype=np.int64)
|
|
176
164
|
else:
|
|
@@ -182,7 +170,6 @@ def generate_time_series(request, config):
|
|
|
182
170
|
total_hits += int(bin_counts[i])
|
|
183
171
|
|
|
184
172
|
# Build absolute_count output matching expected format
|
|
185
|
-
t1 = time.time()
|
|
186
173
|
absolute_count = {}
|
|
187
174
|
for i, (range_start, date_range) in enumerate(date_ranges):
|
|
188
175
|
params = {"report": "concordance", "start": "0", "end": "0"}
|
|
@@ -193,7 +180,6 @@ def generate_time_series(request, config):
|
|
|
193
180
|
"count": int(bin_counts[i]),
|
|
194
181
|
"url": url,
|
|
195
182
|
}
|
|
196
|
-
print(f"[time_series] build output ({n_ranges} ranges): {time.time()-t1:.3f}s", flush=True)
|
|
197
183
|
|
|
198
184
|
time_series_object["results_length"] = int(total_hits)
|
|
199
185
|
time_series_object["more_results"] = False
|
|
@@ -202,10 +188,31 @@ def generate_time_series(request, config):
|
|
|
202
188
|
"date_count": {str(date): count for date, count in date_counts.items()},
|
|
203
189
|
}
|
|
204
190
|
|
|
205
|
-
print(f"[time_series] TOTAL: {time.time()-t0:.3f}s", flush=True)
|
|
206
191
|
return time_series_object
|
|
207
192
|
|
|
208
193
|
|
|
194
|
+
def time_series_to_csv(results):
|
|
195
|
+
"""Convert time series results to CSV string."""
|
|
196
|
+
import csv
|
|
197
|
+
import io
|
|
198
|
+
|
|
199
|
+
absolute_count = results.get("absolute_count", {})
|
|
200
|
+
date_count = results.get("date_count", {})
|
|
201
|
+
if not absolute_count:
|
|
202
|
+
return ""
|
|
203
|
+
output = io.StringIO()
|
|
204
|
+
writer = csv.DictWriter(output, fieldnames=["period", "count", "total_words"])
|
|
205
|
+
writer.writeheader()
|
|
206
|
+
for period_start in sorted(absolute_count.keys(), key=int):
|
|
207
|
+
entry = absolute_count[period_start]
|
|
208
|
+
writer.writerow({
|
|
209
|
+
"period": entry["label"],
|
|
210
|
+
"count": entry["count"],
|
|
211
|
+
"total_words": date_count.get(period_start, ""),
|
|
212
|
+
})
|
|
213
|
+
return output.getvalue()
|
|
214
|
+
|
|
215
|
+
|
|
209
216
|
def get_start_end_date(db, config, start_date=None, end_date=None):
|
|
210
217
|
"""Get start and end date of dataset"""
|
|
211
218
|
year_field = validate_column(config.time_series_year_field, db)
|
|
@@ -12,45 +12,13 @@ import regex as re
|
|
|
12
12
|
from unidecode import unidecode
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
# Process-level cache: one LMDB env per lmdb_path, kept open for the
|
|
16
|
-
# lifetime of the worker process (avoids repeated open/close overhead).
|
|
17
|
-
_norm_lmdb_cache: dict[str, lmdb.Environment] = {}
|
|
18
|
-
# db_paths for which word_forms.lmdb is absent (no lemma/attr flat files)
|
|
19
|
-
_no_forms_lmdb: set[str] = set()
|
|
20
|
-
|
|
21
15
|
# Flat files (in frequencies/) that feed word_forms.lmdb
|
|
22
16
|
_FORMS_FLAT_FILES = ("lemmas", "word_attributes", "lemma_word_attributes")
|
|
23
17
|
|
|
24
18
|
|
|
25
|
-
def get_lmdb_env(lmdb_path: str) -> lmdb.Environment:
|
|
26
|
-
"""
|
|
27
|
-
env = _norm_lmdb_cache.get(lmdb_path)
|
28
|
-
if env is not None:
|
|
29
|
-
return env
|
|
30
|
-
env = lmdb.open(lmdb_path, readonly=True, lock=False, readahead=False, max_spare_txns=4)
|
|
31
|
-
_norm_lmdb_cache[lmdb_path] = env
|
|
32
|
-
return env
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def _get_norm_env(freq_file: str) -> lmdb.Environment:
|
|
36
|
-
"""Return (and cache) the norm_word.lmdb env (built at index time by PostFilters)."""
|
|
37
|
-
return get_lmdb_env(freq_file + ".lmdb")
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def _get_forms_env(db_path: str) -> lmdb.Environment | None:
|
|
41
|
-
"""Return (and cache) the word_forms.lmdb env (built at index time by PostFilters).
|
|
42
|
-
|
|
43
|
-
Returns None if the database has no word_forms.lmdb (no lemma/attr data).
|
|
44
|
-
"""
|
|
45
|
-
lmdb_path = os.path.join(db_path, "frequencies", "word_forms.lmdb")
|
|
46
|
-
if lmdb_path in _norm_lmdb_cache:
|
|
47
|
-
return _norm_lmdb_cache[lmdb_path]
|
|
48
|
-
if db_path in _no_forms_lmdb:
|
|
49
|
-
return None
|
|
50
|
-
if not os.path.exists(lmdb_path):
|
|
51
|
-
_no_forms_lmdb.add(db_path)
|
|
52
|
-
return None
|
|
53
|
-
return get_lmdb_env(lmdb_path)
|
|
19
|
+
def _open_lmdb(lmdb_path: str) -> lmdb.Environment:
|
|
20
|
+
"""Open a read-only LMDB environment. Caller should close it when done."""
|
|
21
|
+
return lmdb.open(lmdb_path, readonly=True, lock=False, readahead=False)
|
|
54
22
|
|
|
55
23
|
|
|
56
24
|
def _norm_key(token: str, lowercase: bool = True) -> bytes:
|
|
@@ -258,9 +226,10 @@ def expand_query_not(split, freq_file, dest_fh, ascii_conversion, lowercase=True
|
|
|
258
226
|
forms, and writes the result to dest_fh.
|
|
259
227
|
Groups are separated by blank lines (consumed by get_word_groups()).
|
|
260
228
|
"""
|
|
261
|
-
env = _get_norm_env(freq_file)
|
|
229
|
+
env = _open_lmdb(freq_file + ".lmdb")
|
|
262
230
|
db_path = os.path.normpath(os.path.join(os.path.dirname(freq_file), ".."))
|
|
263
|
-
forms_env = _get_forms_env(db_path)
|
231
|
+
forms_lmdb_path = os.path.join(db_path, "frequencies", "word_forms.lmdb")
|
|
232
|
+
forms_env = _open_lmdb(forms_lmdb_path) if os.path.exists(forms_lmdb_path) else None
|
|
264
233
|
first = True
|
|
265
234
|
|
|
266
235
|
with env.begin(buffers=True) as txn:
|
|
@@ -303,6 +272,9 @@ def expand_query_not(split, freq_file, dest_fh, ascii_conversion, lowercase=True
|
|
|
303
272
|
dest_fh.write(form + "\n")
|
|
304
273
|
except TypeError:
|
|
305
274
|
dest_fh.write((form + "\n").encode("utf-8"))
|
|
275
|
+
env.close()
|
|
276
|
+
if forms_env is not None:
|
|
277
|
+
forms_env.close()
|
|
306
278
|
|
|
307
279
|
|
|
308
280
|
# ── Metadata inverted word index ──────────────────────────────────────────────
|
|
@@ -364,24 +336,22 @@ def build_metadata_word_index(db_path: str) -> int:
|
|
|
364
336
|
return len(index)
|
|
365
337
|
|
|
366
338
|
|
|
367
|
-
def _get_metadata_index_env(db_path: str) -> lmdb.Environment:
|
|
368
|
-
"""Return (and cache) the metadata_word_index.lmdb env (built at index time by PostFilters)."""
|
|
369
|
-
lmdb_path = os.path.join(db_path, "frequencies", _META_LMDB_NAME)
|
|
370
|
-
return get_lmdb_env(lmdb_path)
|
|
371
|
-
|
|
372
339
|
|
|
373
340
|
def metadata_word_lookup(db_path: str, field: str, term: str) -> list[str]:
|
|
374
341
|
"""Look up metadata values containing term as a whole word.
|
|
375
342
|
|
|
376
343
|
Returns list of original metadata values from the inverted word index.
|
|
377
344
|
"""
|
|
378
|
-
env = _get_metadata_index_env(db_path)
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
345
|
+
env = _open_lmdb(os.path.join(db_path, "frequencies", _META_LMDB_NAME))
|
|
346
|
+
try:
|
|
347
|
+
key = f"{field}\x00{term}".encode("utf-8")
|
|
348
|
+
with env.begin(buffers=True) as txn:
|
|
349
|
+
val = txn.get(key)
|
|
350
|
+
if val is None:
|
|
351
|
+
return []
|
|
352
|
+
return bytes(val).decode("utf-8").split("\x00")
|
|
353
|
+
finally:
|
|
354
|
+
env.close()
|
|
385
355
|
|
|
386
356
|
|
|
387
357
|
def metadata_word_regex_scan(db_path: str, field: str, pattern: str) -> list[str]:
|
|
@@ -391,31 +361,34 @@ def metadata_word_regex_scan(db_path: str, field: str, pattern: str) -> list[str
|
|
|
391
361
|
indexed word. Returns deduplicated list of original metadata values
|
|
392
362
|
from all matching words.
|
|
393
363
|
"""
|
|
394
|
-
env =
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
seen
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
364
|
+
env = _open_lmdb(os.path.join(db_path, "frequencies", _META_LMDB_NAME))
|
|
365
|
+
try:
|
|
366
|
+
field_prefix = f"{field}\x00".encode("utf-8")
|
|
367
|
+
compiled = re.compile(pattern)
|
|
368
|
+
seen: set[str] = set()
|
|
369
|
+
results: list[str] = []
|
|
370
|
+
with env.begin(buffers=True) as txn:
|
|
371
|
+
cursor = txn.cursor()
|
|
372
|
+
try:
|
|
373
|
+
if not cursor.set_range(field_prefix):
|
|
374
|
+
return results
|
|
375
|
+
while True:
|
|
376
|
+
k = bytes(cursor.key())
|
|
377
|
+
if not k.startswith(field_prefix):
|
|
378
|
+
break
|
|
379
|
+
word = k[len(field_prefix):].decode("utf-8", errors="replace")
|
|
380
|
+
if compiled.search(word):
|
|
381
|
+
for val in bytes(cursor.value()).decode("utf-8").split("\x00"):
|
|
382
|
+
if val not in seen:
|
|
383
|
+
seen.add(val)
|
|
384
|
+
results.append(val)
|
|
385
|
+
if not cursor.next():
|
|
386
|
+
break
|
|
387
|
+
finally:
|
|
388
|
+
cursor.close()
|
|
389
|
+
return results
|
|
390
|
+
finally:
|
|
391
|
+
env.close()
|
|
419
392
|
|
|
420
393
|
|
|
421
394
|
def metadata_word_prefix_scan(db_path: str, field: str, prefix: str,
|
|
@@ -425,30 +398,33 @@ def metadata_word_prefix_scan(db_path: str, field: str, prefix: str,
|
|
|
425
398
|
Returns deduplicated list of original metadata values from all matching words.
|
|
426
399
|
Used for metadata autocomplete.
|
|
427
400
|
"""
|
|
428
|
-
env =
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
seen
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
401
|
+
env = _open_lmdb(os.path.join(db_path, "frequencies", _META_LMDB_NAME))
|
|
402
|
+
try:
|
|
403
|
+
key_prefix = f"{field}\x00{prefix}".encode("utf-8")
|
|
404
|
+
seen: set[str] = set()
|
|
405
|
+
results: list[str] = []
|
|
406
|
+
with env.begin(buffers=True) as txn:
|
|
407
|
+
cursor = txn.cursor()
|
|
408
|
+
try:
|
|
409
|
+
if not cursor.set_range(key_prefix):
|
|
410
|
+
return results
|
|
411
|
+
while True:
|
|
412
|
+
k = bytes(cursor.key())
|
|
413
|
+
if not k.startswith(key_prefix):
|
|
414
|
+
break
|
|
415
|
+
for val in bytes(cursor.value()).decode("utf-8").split("\x00"):
|
|
416
|
+
if val not in seen:
|
|
417
|
+
seen.add(val)
|
|
418
|
+
results.append(val)
|
|
419
|
+
if len(results) >= max_results:
|
|
420
|
+
return results
|
|
421
|
+
if not cursor.next():
|
|
422
|
+
break
|
|
423
|
+
finally:
|
|
424
|
+
cursor.close()
|
|
425
|
+
return results
|
|
426
|
+
finally:
|
|
427
|
+
env.close()
|
|
452
428
|
|
|
453
429
|
|
|
454
430
|
def expand_autocomplete(kind: str, token: str, frequency_file: str, db_path: str,
|
|
@@ -469,29 +445,36 @@ def expand_autocomplete(kind: str, token: str, frequency_file: str, db_path: str
|
|
|
469
445
|
raw_token = token[1:-1] if kind == "QUOTE" else token
|
|
470
446
|
if not raw_token:
|
|
471
447
|
return []
|
|
472
|
-
env =
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
448
|
+
env = _open_lmdb(frequency_file + ".lmdb")
|
|
449
|
+
try:
|
|
450
|
+
with env.begin(buffers=True) as txn:
|
|
451
|
+
if _is_regex_pattern(raw_token):
|
|
452
|
+
norm_prefix, pattern_str = _normalize_pattern(raw_token, lowercase and ascii_conversion)
|
|
453
|
+
return _lmdb_expand_term(txn, norm_prefix, pattern_str, max_results)
|
|
454
|
+
elif ascii_conversion:
|
|
455
|
+
norm_prefix = _norm_key(raw_token, lowercase)
|
|
456
|
+
return _lmdb_expand_term(txn, norm_prefix, None, max_results)
|
|
457
|
+
else:
|
|
458
|
+
# ascii_conversion=False: query token is the norm key as-is
|
|
459
|
+
norm_prefix = raw_token.lower().encode("utf-8") if lowercase else raw_token.encode("utf-8")
|
|
460
|
+
return _lmdb_expand_term(txn, norm_prefix, None, max_results)
|
|
461
|
+
finally:
|
|
462
|
+
env.close()
|
|
484
463
|
|
|
485
464
|
elif kind in ("LEMMA", "ATTR", "LEMMA_ATTR"):
|
|
486
465
|
if not token:
|
|
487
466
|
return []
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
467
|
+
forms_lmdb_path = os.path.join(db_path, "frequencies", "word_forms.lmdb")
|
|
468
|
+
scan_env = _open_lmdb(forms_lmdb_path) if os.path.exists(forms_lmdb_path) else _open_lmdb(os.path.join(db_path, "words.lmdb"))
|
|
469
|
+
try:
|
|
470
|
+
with scan_env.begin(buffers=True) as txn:
|
|
471
|
+
if _is_regex_pattern(token):
|
|
472
|
+
literal, meta = _split_literal_prefix(token)
|
|
473
|
+
prefix_bytes = literal.encode("utf-8")
|
|
474
|
+
return _lmdb_scan_keys(txn, prefix_bytes, literal + meta, max_results)
|
|
475
|
+
else:
|
|
476
|
+
return _lmdb_scan_keys(txn, token.encode("utf-8"), None, max_results)
|
|
477
|
+
finally:
|
|
478
|
+
scan_env.close()
|
|
496
479
|
|
|
497
480
|
return []
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Upgrade gunicorn.conf.py while preserving user customizations.
|
|
2
|
+
|
|
3
|
+
Compares the old installed gunicorn.conf.py against the old shipped defaults
|
|
4
|
+
(gunicorn.conf.defaults.py) to detect user customizations, then replaces
|
|
5
|
+
the corresponding lines in the new version in place.
|
|
6
|
+
|
|
7
|
+
Settings that the user never changed get the new defaults automatically.
|
|
8
|
+
Settings the user explicitly changed are preserved at their original location.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import ast
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Names of the settings whose values may be carried over from an old
# gunicorn.conf.py into a new one. Anything not listed here (hooks, imports,
# computed values) is always taken from the new version.
MERGEABLE_SETTINGS = {
    "accesslog", "bind", "capture_output", "errorlog",
    "loglevel", "max_requests", "max_requests_jitter",
    "preload_app", "proc_name", "timeout", "workers",
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _load_conf_values(path):
    """Extract literal top-level assignments from a Python config file using AST.

    Only simple ``name = <literal>`` statements at module level are considered,
    and only for names listed in MERGEABLE_SETTINGS. "Literal" means whatever
    ``ast.literal_eval`` accepts: strings, bytes, numbers, booleans, None and
    tuples/lists/dicts/sets built from those. Function calls, imports and
    other computed values are ignored -- no code from the file is executed.

    When a setting is assigned several times the last assignment wins. If
    that last assignment is not a literal, the setting is left out of the
    result entirely, because its effective value cannot be known statically.

    Args:
        path: path of the config file to read.

    Returns:
        Dict mapping each mergeable setting name to its literal value.

    Raises:
        OSError: if the file cannot be read.
        SyntaxError: if the file is not valid Python.
    """
    # Hand raw bytes to the parser so Python's own source-encoding rules apply
    # (UTF-8 by default, BOM / PEP 263 coding cookie honoured) rather than
    # whatever the process locale happens to be.
    with open(path, "rb") as f:
        tree = ast.parse(f.read(), filename=path)

    values = {}
    for node in ast.iter_child_nodes(tree):
        if not isinstance(node, ast.Assign):
            continue
        # Skip chained (a = b = 1), tuple, attribute and subscript targets.
        if len(node.targets) != 1 or not isinstance(node.targets[0], ast.Name):
            continue
        name = node.targets[0].id
        if name not in MERGEABLE_SETTINGS:
            continue
        try:
            values[name] = ast.literal_eval(node.value)
        except (ValueError, TypeError):
            # Not a literal (e.g. min(cpu_count(), 4)) -- can't merge. Also
            # forget any literal recorded from an earlier assignment of the
            # same name: this later, computed assignment overrides it.
            values.pop(name, None)
    return values
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _load_conf_names(path):
    """Extract all top-level assignment names from a config file.

    Unlike _load_conf_values, this returns names even for non-literal values
    (e.g. min(cpu_count(), 4)), so callers can detect which settings exist in
    the file. Only plain ``name = ...`` statements at module level count;
    chained, tuple, attribute and subscript targets are ignored, as are
    assignments nested inside functions or other blocks.

    Args:
        path: path of the config file to read.

    Returns:
        Set of assigned names.

    Raises:
        OSError: if the file cannot be read.
        SyntaxError: if the file is not valid Python.
    """
    # Parse from bytes so Python's source-encoding rules (UTF-8 default, BOM,
    # PEP 263 cookie) are used instead of the locale's preferred encoding.
    with open(path, "rb") as f:
        tree = ast.parse(f.read(), filename=path)
    return {
        node.targets[0].id
        for node in ast.iter_child_nodes(tree)
        if isinstance(node, ast.Assign)
        and len(node.targets) == 1
        and isinstance(node.targets[0], ast.Name)
    }
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _replace_setting_in_file(filepath, name, value):
    """Replace a setting's value in-place in a Python config file.

    Every line that starts at column 0 with ``name =`` has the remainder of
    that line replaced by ``repr(value)``. Commented-out or indented
    assignments are untouched because the match is anchored at line start.
    The file is rewritten only when at least one line matched.

    Limitation: only the rest of the matched physical line is replaced, so a
    trailing comment on that line is dropped and the continuation lines of a
    multi-line value are left where they are.

    Args:
        filepath: path of the config file to modify.
        name: setting name to look for.
        value: new value; written using its ``repr``.

    Returns:
        True if at least one assignment line was rewritten, False otherwise.
    """
    with open(filepath, encoding="utf-8") as f:
        content = f.read()

    # Match the assignment line: `name = <anything>` (not inside a comment)
    pattern = re.compile(rf'^({re.escape(name)}\s*=\s*).*$', re.MULTILINE)
    new_literal = repr(value)
    # Use a callable replacement: a template string would have its backslashes
    # re-interpreted by re (turning the "\n" in a repr into a real newline, or
    # raising on sequences such as "\d"), corrupting the written value.
    new_content, count = pattern.subn(lambda m: m.group(1) + new_literal, content)

    if count == 0:
        return False
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(new_content)
    return True
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def upgrade_gunicorn_conf(old_conf, old_defaults, new_conf, new_defaults=None):
    """Upgrade gunicorn.conf.py preserving user customizations.

    The new conf and defaults files should already be in their final location
    (e.g. copied by install.sh). This function reads the OLD backups to detect
    customizations, then replaces the corresponding values in the new conf.

    Only settings listed in MERGEABLE_SETTINGS whose old value is a literal
    are considered; everything else keeps whatever the new conf ships with.

    Args:
        old_conf: path to backup of the previously installed gunicorn.conf.py
        old_defaults: path to backup of the previously installed gunicorn.conf.defaults.py
        new_conf: path to the new gunicorn.conf.py (already in place)
        new_defaults: path to the new gunicorn.conf.defaults.py (already in place).
            When omitted, or when that file assigns no names, customizations
            are not filtered against the new defaults.

    Returns:
        List of setting names that were preserved from the old config, in
        alphabetical order.
    """
    # Load literal values from the old files
    prev_defaults = _load_conf_values(old_defaults)
    prev_conf = _load_conf_values(old_conf)

    # A setting counts as customized when the old conf gives it a literal value
    # that either differs from the old default, or has no literal old default
    # to compare against (missing from the old defaults, or computed there).
    # Iterating in sorted order keeps the returned list deterministic; plain
    # set iteration order varies between runs.
    user_customizations = {
        key: prev_conf[key]
        for key in sorted(MERGEABLE_SETTINGS)
        if key in prev_conf
        and (key not in prev_defaults or prev_conf[key] != prev_defaults[key])
    }

    # Only preserve customizations for settings that exist in the new defaults.
    # A setting that the new defaults no longer assign is intentionally
    # dropped -- even if the user had customized it.
    new_default_names = _load_conf_names(new_defaults) if new_defaults else set()
    preserved = []
    for key, value in user_customizations.items():
        if new_default_names and key not in new_default_names:
            continue
        # The helper reports False when the new conf has no top-level line for
        # this setting; such customizations are not carried over.
        if _replace_setting_in_file(new_conf, key, value):
            preserved.append(key)

    return preserved
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -8,7 +8,6 @@ proxy, exact_cooc, sentence).
|
|
|
8
8
|
|
|
9
9
|
import os
|
|
10
10
|
|
|
11
|
-
import lmdb
|
|
12
11
|
import numba
|
|
13
12
|
import numpy as np
|
|
14
13
|
|
|
@@ -20,6 +19,8 @@ if not os.access(cache_dir, os.W_OK):
|
|
|
20
19
|
os.environ["NUMBA_CACHE_DIR"] = cache_dir
|
|
21
20
|
numba.config.CACHE_DIR = cache_dir
|
|
22
21
|
|
|
22
|
+
import lmdb
|
|
23
|
+
|
|
23
24
|
from philologic.runtime.Query import (
|
|
24
25
|
_find_doc_boundaries,
|
|
25
26
|
_load_word_arrays,
|
|
@@ -901,7 +902,6 @@ def search_phrase(db_path, hitlist_filename, overflow_words, corpus_file=None):
|
|
|
901
902
|
if not flushed:
|
|
902
903
|
output_file.flush()
|
|
903
904
|
flushed = True
|
|
904
|
-
|
|
905
905
|
env.close()
|
|
906
906
|
|
|
907
907
|
|
|
@@ -1208,5 +1208,5 @@ def _search_two_groups_batched(db_path, hitlist_filename, word_groups, overflow_
|
|
|
1208
1208
|
if not flushed:
|
|
1209
1209
|
output_file.flush()
|
|
1210
1210
|
flushed = True
|
|
1211
|
-
|
|
1212
1211
|
env.close()
|
|
1212
|
+
|
|
File without changes
|
|
File without changes
|
{philologic-5.2.0.2 → philologic-5.2.2}/philologic/runtime/reports/generate_word_frequency.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|