LZGraphs 3.0.0__tar.gz → 3.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/PKG-INFO +1 -1
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/edge_builder.h +4 -2
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/gene_data.h +2 -2
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/graph.h +32 -5
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/hash_map.h +16 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/io.h +4 -4
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/posterior.h +1 -1
- lzgraphs-3.0.1/include/lzgraph/walk_dict.h +232 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/core/hash_map.c +88 -34
- lzgraphs-3.0.1/lib/graph/csr_graph.c +1854 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/graph/edge_builder.c +18 -17
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/graph/gene_data.c +2 -2
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/graph/graph_ops.c +35 -50
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/graph/lz76.c +103 -26
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/io/io.c +58 -24
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/posterior/posterior.c +5 -5
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/simulation/genomic_simulate.c +5 -2
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/simulation/simulate.c +72 -52
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs/__init__.py +1 -1
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs/_clzgraph.c +108 -29
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs/_graph.py +32 -2
- lzgraphs-3.0.1/src/LZGraphs/_io.py +598 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs/cli.py +175 -15
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs.egg-info/PKG-INFO +1 -1
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs.egg-info/SOURCES.txt +1 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/tests/test_cli.py +70 -1
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/tests/test_graph.py +84 -1
- lzgraphs-3.0.1/tests/test_io.py +60 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/tests/test_operations.py +17 -0
- lzgraphs-3.0.0/include/lzgraph/walk_dict.h +0 -123
- lzgraphs-3.0.0/lib/graph/csr_graph.c +0 -674
- lzgraphs-3.0.0/src/LZGraphs/_io.py +0 -191
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/CHANGELOG.md +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/CONTRIBUTING.md +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/LICENSE +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/MANIFEST.in +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/Makefile +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/README.md +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/analytics.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/common.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/crc32c.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/diversity.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/features.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/forward.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/genomic_simulate.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/graph_ops.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/lz76.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/occupancy.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/pgen_dist.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/rng.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/sharing.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/simulate.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/string_pool.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/include/lzgraph/wynn.h +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/analytics/analytics.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/analytics/diversity.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/analytics/features.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/analytics/pgen_dist.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/core/common.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/core/crc32c.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/core/rng.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/core/string_pool.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/core/wynn.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/graph/forward.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/occupancy/occupancy.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/lib/occupancy/sharing.c +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/pyproject.toml +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/setup.cfg +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/setup.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs/_errors.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs/_pgen_dist.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs/_simulation_result.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs.egg-info/dependency_links.txt +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs.egg-info/entry_points.txt +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs.egg-info/requires.txt +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/src/LZGraphs.egg-info/top_level.txt +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/tests/test_analytics.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/tests/test_deep_audit.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/tests/test_known_distributions.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/tests/test_lzpgen.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/tests/test_mathematician_audit.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/tests/test_model_audit.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/tests/test_simulate.py +0 -0
- {lzgraphs-3.0.0 → lzgraphs-3.0.1}/tests/test_statistical_validity.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: LZGraphs
|
|
3
|
-
Version: 3.0.
|
|
3
|
+
Version: 3.0.1
|
|
4
4
|
Summary: High-performance LZ76 compression graphs for immune receptor repertoire analysis
|
|
5
5
|
Author-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
|
|
6
6
|
Maintainer-email: Thomas Konstantinovsky <thomaskon90@gmail.com>
|
|
@@ -20,7 +20,7 @@ typedef struct LZGEdgeBuilder_ {
|
|
|
20
20
|
LZGHashMap *edge_map; /* key: pack(src,dst), value: edge index */
|
|
21
21
|
uint32_t *src_ids; /* [n_edges] source node IDs */
|
|
22
22
|
uint32_t *dst_ids; /* [n_edges] destination node IDs */
|
|
23
|
-
|
|
23
|
+
uint64_t *counts; /* [n_edges] raw transition counts */
|
|
24
24
|
uint32_t n_edges;
|
|
25
25
|
uint32_t capacity;
|
|
26
26
|
} LZGEdgeBuilder;
|
|
@@ -34,10 +34,12 @@ void lzg_eb_destroy(LZGEdgeBuilder *eb);
|
|
|
34
34
|
/**
|
|
35
35
|
* Record a transition from src → dst with the given count.
|
|
36
36
|
* If the edge already exists, increments its count.
|
|
37
|
+
* If `out_edge_idx` is non-NULL, returns the edge index that was updated.
|
|
37
38
|
*/
|
|
38
39
|
LZGError lzg_eb_record(LZGEdgeBuilder *eb,
|
|
39
40
|
uint32_t src_id, uint32_t dst_id,
|
|
40
|
-
|
|
41
|
+
uint64_t count,
|
|
42
|
+
uint32_t *out_edge_idx);
|
|
41
43
|
|
|
42
44
|
/** Pack key from (src, dst) pair. */
|
|
43
45
|
static inline uint64_t lzg_eb_pack_key(uint32_t src, uint32_t dst) {
|
|
@@ -26,13 +26,13 @@ typedef struct LZGGeneData_ {
|
|
|
26
26
|
/* ── Per-edge V gene counts (CSR-within-CSR) ── */
|
|
27
27
|
uint32_t *v_offsets; /* [n_edges + 1]: range per edge */
|
|
28
28
|
uint32_t *v_gene_ids; /* [total_v_entries]: interned gene IDs */
|
|
29
|
-
|
|
29
|
+
uint64_t *v_gene_counts; /* [total_v_entries]: raw counts */
|
|
30
30
|
uint32_t total_v_entries;
|
|
31
31
|
|
|
32
32
|
/* ── Per-edge J gene counts (CSR-within-CSR) ── */
|
|
33
33
|
uint32_t *j_offsets;
|
|
34
34
|
uint32_t *j_gene_ids;
|
|
35
|
-
|
|
35
|
+
uint64_t *j_gene_counts;
|
|
36
36
|
uint32_t total_j_entries;
|
|
37
37
|
|
|
38
38
|
/* ── Graph-level marginal distributions ── */
|
|
@@ -37,7 +37,7 @@ typedef struct LZGGraph_ {
|
|
|
37
37
|
|
|
38
38
|
/* ── Per-edge data (parallel to col_indices) ── */
|
|
39
39
|
double *edge_weights; /* [n_edges]: normalized P(dst|src) */
|
|
40
|
-
|
|
40
|
+
uint64_t *edge_counts; /* [n_edges]: raw transition counts */
|
|
41
41
|
|
|
42
42
|
/* ── Per-edge LZ constraint info ── */
|
|
43
43
|
uint32_t *edge_sp_id; /* [n_edges]: interned subpattern of dst */
|
|
@@ -45,7 +45,7 @@ typedef struct LZGGraph_ {
|
|
|
45
45
|
uint32_t *edge_prefix_id; /* [n_edges]: interned prefix of dst sp */
|
|
46
46
|
|
|
47
47
|
/* ── Per-node data ── */
|
|
48
|
-
|
|
48
|
+
uint64_t *outgoing_counts; /* [n_nodes]: total raw outgoing count */
|
|
49
49
|
uint32_t *node_sp_id; /* [n_nodes]: interned subpattern string */
|
|
50
50
|
uint8_t *node_sp_len; /* [n_nodes]: subpattern character length */
|
|
51
51
|
uint32_t *node_pos; /* [n_nodes]: cumulative position integer */
|
|
@@ -59,7 +59,7 @@ typedef struct LZGGraph_ {
|
|
|
59
59
|
bool topo_valid;
|
|
60
60
|
|
|
61
61
|
/* ── Length distribution ── */
|
|
62
|
-
|
|
62
|
+
uint64_t *length_counts; /* indexed by sequence length */
|
|
63
63
|
uint32_t max_length; /* largest observed sequence length */
|
|
64
64
|
|
|
65
65
|
/* ── String pool (owns all string data) ── */
|
|
@@ -71,6 +71,14 @@ typedef struct LZGGraph_ {
|
|
|
71
71
|
|
|
72
72
|
/* ── Gene data (optional, NULL if no gene columns provided) ── */
|
|
73
73
|
struct LZGGeneData_ *gene_data;
|
|
74
|
+
|
|
75
|
+
/* ── Transient query cache (not serialized) ── */
|
|
76
|
+
LZGHashMap *query_node_map; /* structural key -> node index */
|
|
77
|
+
uint64_t *edge_sp_hash; /* [n_edges]: hash of destination token */
|
|
78
|
+
uint64_t *edge_prefix_hash; /* [n_edges]: hash of prefix token or 0 */
|
|
79
|
+
uint64_t *node_sp_hash; /* [n_nodes]: hash of node token */
|
|
80
|
+
uint8_t *edge_single_char_idx; /* [n_edges]: aa bit index or UINT8_MAX */
|
|
81
|
+
uint8_t *node_single_char_idx; /* [n_nodes]: aa bit index or UINT8_MAX */
|
|
74
82
|
} LZGGraph;
|
|
75
83
|
|
|
76
84
|
/** Allocate and initialize an empty graph. */
|
|
@@ -95,12 +103,25 @@ void lzg_graph_destroy(LZGGraph *g);
|
|
|
95
103
|
LZGError lzg_graph_build(LZGGraph *g,
|
|
96
104
|
const char **sequences,
|
|
97
105
|
uint32_t n_seqs,
|
|
98
|
-
const
|
|
106
|
+
const uint64_t *abundances,
|
|
99
107
|
const char **v_genes,
|
|
100
108
|
const char **j_genes,
|
|
101
109
|
double smoothing,
|
|
102
110
|
uint32_t min_init);
|
|
103
111
|
|
|
112
|
+
/**
|
|
113
|
+
* Build a graph from a plain text file without materializing all sequences in Python.
|
|
114
|
+
*
|
|
115
|
+
* Supported input formats:
|
|
116
|
+
* - one sequence per line
|
|
117
|
+
* - sequence<TAB>abundance
|
|
118
|
+
*
|
|
119
|
+
* Gene columns and headered tabular formats are not supported by this path.
|
|
120
|
+
*/
|
|
121
|
+
LZGError lzg_graph_build_plain_file(LZGGraph *g,
|
|
122
|
+
const char *path,
|
|
123
|
+
double smoothing);
|
|
124
|
+
|
|
104
125
|
/** Compute topological sort (cached, invalidated on structural changes). */
|
|
105
126
|
LZGError lzg_graph_topo_sort(LZGGraph *g);
|
|
106
127
|
|
|
@@ -116,7 +137,7 @@ LZGError lzg_graph_finalize_from_edges(
|
|
|
116
137
|
LZGHashMap *initial_counts,
|
|
117
138
|
LZGHashMap *terminal_counts,
|
|
118
139
|
LZGHashMap *outgoing_counts,
|
|
119
|
-
|
|
140
|
+
uint64_t *len_counts, uint32_t max_len);
|
|
120
141
|
|
|
121
142
|
/* ── Recalculation flags ───────────────────────────────────── */
|
|
122
143
|
|
|
@@ -132,6 +153,12 @@ typedef enum {
|
|
|
132
153
|
*/
|
|
133
154
|
LZGError lzg_graph_recalculate(LZGGraph *g, uint32_t flags);
|
|
134
155
|
|
|
156
|
+
/**
|
|
157
|
+
* Internal: ensure transient query-side edge hash caches exist.
|
|
158
|
+
* These caches are derived from immutable edge metadata and are not serialized.
|
|
159
|
+
*/
|
|
160
|
+
LZGError lzg_graph_ensure_query_edge_hashes(LZGGraph *g);
|
|
161
|
+
|
|
135
162
|
/** Get the number of successors for node `node_id`. */
|
|
136
163
|
static inline uint32_t lzg_graph_out_degree(const LZGGraph *g, uint32_t node_id) {
|
|
137
164
|
return g->row_offsets[node_id + 1] - g->row_offsets[node_id];
|
|
@@ -27,6 +27,22 @@ bool lzg_hm_put(LZGHashMap *map, uint64_t key, uint64_t value);
|
|
|
27
27
|
/** Lookup. Returns pointer to value or NULL. */
|
|
28
28
|
uint64_t *lzg_hm_get(const LZGHashMap *map, uint64_t key);
|
|
29
29
|
|
|
30
|
+
/**
|
|
31
|
+
* Lookup or insert in a single probe walk. Returns a writable value slot.
|
|
32
|
+
* If the key is absent, inserts it with `initial_value`.
|
|
33
|
+
* If `inserted` is non-NULL, it is set to true iff a new entry was created.
|
|
34
|
+
*/
|
|
35
|
+
uint64_t *lzg_hm_get_or_insert(LZGHashMap *map, uint64_t key,
|
|
36
|
+
uint64_t initial_value, bool *inserted);
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Add `delta` to the value stored at `key`, inserting `delta` if absent.
|
|
40
|
+
* Returns a writable value slot after the update.
|
|
41
|
+
* If `inserted` is non-NULL, it is set to true iff a new entry was created.
|
|
42
|
+
*/
|
|
43
|
+
uint64_t *lzg_hm_add_u64(LZGHashMap *map, uint64_t key,
|
|
44
|
+
uint64_t delta, bool *inserted);
|
|
45
|
+
|
|
30
46
|
/** Delete. Returns true if found. */
|
|
31
47
|
bool lzg_hm_delete(LZGHashMap *map, uint64_t key);
|
|
32
48
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @file io.h
|
|
3
|
-
* @brief
|
|
3
|
+
* @brief LZG binary file format: section-based save/load with CRC-32C.
|
|
4
4
|
*
|
|
5
5
|
* File extension: .lzg
|
|
6
6
|
* See FILE_FORMAT.md for the complete specification.
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
/* ── Format constants ──────────────────────────────────────── */
|
|
15
15
|
|
|
16
16
|
#define LZG_IO_MAGIC 0x4C5A4732u /* "LZG2" */
|
|
17
|
-
#define LZG_IO_FORMAT_VERSION
|
|
17
|
+
#define LZG_IO_FORMAT_VERSION 3
|
|
18
18
|
#define LZG_IO_ENDIAN_LE 0x01
|
|
19
19
|
#define LZG_IO_TRAILER_MAGIC 0x454E444Cu /* "ENDL" */
|
|
20
20
|
|
|
@@ -50,8 +50,8 @@
|
|
|
50
50
|
LZG_PACKED_BEGIN
|
|
51
51
|
typedef struct LZG_PACKED_ATTR {
|
|
52
52
|
uint32_t magic; /* 0x4C5A4732 */
|
|
53
|
-
uint16_t format_version; /*
|
|
54
|
-
uint16_t min_reader_version; /*
|
|
53
|
+
uint16_t format_version; /* current format version */
|
|
54
|
+
uint16_t min_reader_version; /* minimum supported reader version */
|
|
55
55
|
uint8_t endian_tag; /* 0x01 = LE */
|
|
56
56
|
uint8_t variant; /* 0=AAP, 1=NDP, 2=Naive */
|
|
57
57
|
uint8_t compression; /* 0=none, 1=zstd, 2=gzip */
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file walk_dict.h
|
|
3
|
+
* @brief Per-walk LZ76 dictionary for dynamic constraint checking.
|
|
4
|
+
*
|
|
5
|
+
* Replaces the precomputed bitmask live index for simulation and walk
|
|
6
|
+
* probability. Each walk carries a lightweight hash dictionary that
|
|
7
|
+
* tracks which tokens have been emitted, enabling EXACT LZ76 constraint
|
|
8
|
+
* enforcement at O(degree) per step with no global precomputation.
|
|
9
|
+
*
|
|
10
|
+
* Correctness: The bitmask approach was an approximation —
|
|
11
|
+
* - Single-char: tracked "character in ANY token" (too restrictive)
|
|
12
|
+
* - Multi-char: tracked "first char of prefix seen" (too permissive)
|
|
13
|
+
*
|
|
14
|
+
* The walk dictionary checks the exact LZ76 conditions:
|
|
15
|
+
* - Single-char token "X": X was NOT a previous standalone token
|
|
16
|
+
* - Multi-char token "XY...Z": prefix "XY..." WAS a previous token
|
|
17
|
+
*/
|
|
18
|
+
#ifndef LZGRAPH_WALK_DICT_H
|
|
19
|
+
#define LZGRAPH_WALK_DICT_H
|
|
20
|
+
|
|
21
|
+
#include "lzgraph/common.h"
|
|
22
|
+
#include "lzgraph/graph.h"
|
|
23
|
+
#include "lzgraph/hash_map.h"
|
|
24
|
+
#include "lzgraph/string_pool.h"
|
|
25
|
+
#include <string.h>
|
|
26
|
+
|
|
27
|
+
#define LZG_WD_INLINE_CAP 256u
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* Per-walk LZ76 dictionary state.
|
|
31
|
+
*
|
|
32
|
+
* Uses a small exact fixed-capacity token set in the hot path.
|
|
33
|
+
* If an unexpected walk exceeds that bound, we promote once into an
|
|
34
|
+
* overflow hash map and keep exact behavior.
|
|
35
|
+
*/
|
|
36
|
+
typedef struct {
|
|
37
|
+
uint64_t inline_keys[LZG_WD_INLINE_CAP];
|
|
38
|
+
uint8_t inline_used[LZG_WD_INLINE_CAP];
|
|
39
|
+
uint16_t inline_count;
|
|
40
|
+
uint32_t single_char_bits; /* bitmask: chars emitted as single-char */
|
|
41
|
+
LZGHashMap *overflow; /* exact fallback if inline set fills */
|
|
42
|
+
} LZGWalkDict;
|
|
43
|
+
|
|
44
|
+
static inline bool lzg_wd_inline_contains(const LZGWalkDict *wd, uint64_t key) {
|
|
45
|
+
uint32_t mask = LZG_WD_INLINE_CAP - 1u;
|
|
46
|
+
uint32_t idx = (uint32_t)(key & mask);
|
|
47
|
+
while (wd->inline_used[idx]) {
|
|
48
|
+
if (wd->inline_keys[idx] == key) return true;
|
|
49
|
+
idx = (idx + 1u) & mask;
|
|
50
|
+
}
|
|
51
|
+
return false;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/**
 * Insert `key` into the inline table of `wd`.
 *
 * The table refuses new work once it is 75% full so that probe chains stay
 * short. That check runs before the probe, so at the limit the function
 * returns false even when `key` is already present.
 *
 * @return true if `key` is in the table after the call (newly inserted or
 *         already there); false if the load limit was reached.
 */
static inline bool lzg_wd_inline_insert(LZGWalkDict *wd, uint64_t key) {
    const uint32_t slot_mask = LZG_WD_INLINE_CAP - 1u;
    const bool at_load_limit =
        (uint32_t)wd->inline_count * 4u >= LZG_WD_INLINE_CAP * 3u;

    if (at_load_limit) {
        return false;
    }

    uint32_t slot = (uint32_t)(key & slot_mask);
    for (; wd->inline_used[slot]; slot = (slot + 1u) & slot_mask) {
        if (wd->inline_keys[slot] == key) {
            return true;
        }
    }

    /* `slot` is now the first free position on the probe chain. */
    wd->inline_keys[slot] = key;
    wd->inline_used[slot] = 1u;
    wd->inline_count++;
    return true;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Move every inline entry of `wd` into the overflow hash map.
 *
 * Creates the map if it does not exist yet, otherwise clears it first.
 * After a successful call the inline table is empty.
 *
 * @return false if the overflow map could not be created (the inline table
 *         is left untouched); true otherwise.
 */
static inline bool lzg_wd_promote_overflow(LZGWalkDict *wd) {
    if (wd->overflow) {
        lzg_hm_clear(wd->overflow);
    } else {
        LZGHashMap *fresh = lzg_hm_create(LZG_WD_INLINE_CAP * 2u);
        if (!fresh) {
            return false;
        }
        wd->overflow = fresh;
    }

    uint32_t slot = 0;
    while (slot < LZG_WD_INLINE_CAP) {
        if (wd->inline_used[slot]) {
            lzg_hm_put(wd->overflow, wd->inline_keys[slot], 1u);
        }
        slot++;
    }

    wd->inline_count = 0;
    memset(wd->inline_used, 0, sizeof(wd->inline_used));
    return true;
}
|
|
88
|
+
|
|
89
|
+
static inline bool lzg_wd_contains_hash(const LZGWalkDict *wd, uint64_t key) {
|
|
90
|
+
if (wd->overflow) return lzg_hm_get(wd->overflow, key) != NULL;
|
|
91
|
+
return lzg_wd_inline_contains(wd, key);
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/**
 * Record the token hash `key` in `wd`.
 *
 * Uses the inline table while it accepts entries. The first time it refuses
 * one, the dictionary is promoted to the overflow map and the key is stored
 * there instead.
 *
 * @return false only when promotion fails to create the overflow map;
 *         true otherwise.
 */
static inline bool lzg_wd_record_hash(LZGWalkDict *wd, uint64_t key) {
    if (!wd->overflow) {
        if (lzg_wd_inline_insert(wd, key)) {
            return true;
        }
        if (!lzg_wd_promote_overflow(wd)) {
            return false;
        }
    }
    /* Either already promoted, or promoted just above. */
    lzg_hm_put(wd->overflow, key, 1u);
    return true;
}
|
|
104
|
+
|
|
105
|
+
/**
 * Record a token whose hash and single-character bit index are already known.
 *
 * @param wd              Walk dictionary to update.
 * @param key             Hash of the token.
 * @param single_char_idx Bit index of the token's character when the token is
 *                        a single character, or UINT8_MAX when it is not.
 * @return false if the hash could not be recorded (see lzg_wd_record_hash),
 *         true otherwise.
 */
static inline bool lzg_wd_record_prehashed(LZGWalkDict *wd, uint64_t key,
                                           uint8_t single_char_idx) {
    if (!lzg_wd_record_hash(wd, key)) {
        return false;
    }
    /* single_char_bits is 32 bits wide; shifting by 32 or more is undefined
     * behaviour, so any index that does not fit (UINT8_MAX included) is not
     * tracked in the mask. The hash recorded above still identifies the
     * token exactly. */
    if (single_char_idx < 32u) {
        wd->single_char_bits |= (UINT32_C(1) << single_char_idx);
    }
    return true;
}
|
|
112
|
+
|
|
113
|
+
/** Create a walk dictionary. */
|
|
114
|
+
static inline LZGWalkDict lzg_wd_create(void) {
|
|
115
|
+
LZGWalkDict wd;
|
|
116
|
+
memset(&wd, 0, sizeof(wd));
|
|
117
|
+
return wd;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
/** Reset a walk dictionary for reuse. */
|
|
121
|
+
static inline void lzg_wd_reset(LZGWalkDict *wd) {
|
|
122
|
+
memset(wd->inline_used, 0, sizeof(wd->inline_used));
|
|
123
|
+
wd->inline_count = 0;
|
|
124
|
+
wd->single_char_bits = 0;
|
|
125
|
+
if (wd->overflow) lzg_hm_clear(wd->overflow);
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/** Destroy a walk dictionary. */
|
|
129
|
+
static inline void lzg_wd_destroy(LZGWalkDict *wd) {
|
|
130
|
+
if (wd->overflow) {
|
|
131
|
+
lzg_hm_destroy(wd->overflow);
|
|
132
|
+
wd->overflow = NULL;
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
/**
|
|
137
|
+
* Record a token in the walk dictionary.
|
|
138
|
+
*
|
|
139
|
+
* @param wd Walk dictionary.
|
|
140
|
+
* @param pool String pool containing the token.
|
|
141
|
+
* @param sp_id Interned ID of the token string.
|
|
142
|
+
* @param sp_len Length of the token.
|
|
143
|
+
*/
|
|
144
|
+
static inline void lzg_wd_record(LZGWalkDict *wd, const LZGStringPool *pool,
|
|
145
|
+
uint32_t sp_id, uint8_t sp_len) {
|
|
146
|
+
const char *sp = lzg_sp_get(pool, sp_id);
|
|
147
|
+
uint64_t h = lzg_hash_bytes(sp, sp_len);
|
|
148
|
+
uint8_t single_char_idx = (sp_len == 1) ? lzg_aa_to_bit(sp[0]) : UINT8_MAX;
|
|
149
|
+
(void)lzg_wd_record_prehashed(wd, h, single_char_idx);
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
/**
|
|
153
|
+
* Check if an edge is LZ76-valid given the current walk dictionary.
|
|
154
|
+
*
|
|
155
|
+
* @param g The graph.
|
|
156
|
+
* @param edge Edge index.
|
|
157
|
+
* @param wd Current walk dictionary state.
|
|
158
|
+
* @return true if the edge's token is a valid next LZ76 token.
|
|
159
|
+
*/
|
|
160
|
+
/**
|
|
161
|
+
* Check if an edge is LZ76-valid for a CONTINUING walk (not the last step).
|
|
162
|
+
* Rules:
|
|
163
|
+
* - Single-char "c": c must NOT be a previous standalone token
|
|
164
|
+
* - Multi-char "wc": prefix "w" IN dictionary AND "wc" NOT in dictionary
|
|
165
|
+
*/
|
|
166
|
+
static inline bool lzg_wd_edge_valid(const LZGGraph *g, uint32_t edge,
|
|
167
|
+
const LZGWalkDict *wd) {
|
|
168
|
+
uint8_t sp_len = g->edge_sp_len[edge];
|
|
169
|
+
|
|
170
|
+
if (sp_len == 1) {
|
|
171
|
+
uint8_t idx = g->edge_single_char_idx ? g->edge_single_char_idx[edge] : UINT8_MAX;
|
|
172
|
+
if (idx != UINT8_MAX)
|
|
173
|
+
return !(wd->single_char_bits & (1u << idx));
|
|
174
|
+
const char *sp = lzg_sp_get(g->pool, g->edge_sp_id[edge]);
|
|
175
|
+
return !(wd->single_char_bits & (1u << lzg_aa_to_bit(sp[0])));
|
|
176
|
+
} else {
|
|
177
|
+
uint64_t prefix_h;
|
|
178
|
+
if (g->edge_prefix_hash) {
|
|
179
|
+
prefix_h = g->edge_prefix_hash[edge];
|
|
180
|
+
} else {
|
|
181
|
+
const char *prefix = lzg_sp_get(g->pool, g->edge_prefix_id[edge]);
|
|
182
|
+
uint32_t prefix_len = sp_len - 1;
|
|
183
|
+
prefix_h = lzg_hash_bytes(prefix, prefix_len);
|
|
184
|
+
}
|
|
185
|
+
if (!lzg_wd_contains_hash(wd, prefix_h))
|
|
186
|
+
return false;
|
|
187
|
+
|
|
188
|
+
uint64_t token_h;
|
|
189
|
+
if (g->edge_sp_hash) {
|
|
190
|
+
token_h = g->edge_sp_hash[edge];
|
|
191
|
+
} else {
|
|
192
|
+
const char *sp = lzg_sp_get(g->pool, g->edge_sp_id[edge]);
|
|
193
|
+
token_h = lzg_hash_bytes(sp, sp_len);
|
|
194
|
+
}
|
|
195
|
+
if (lzg_wd_contains_hash(wd, token_h))
|
|
196
|
+
return false;
|
|
197
|
+
|
|
198
|
+
return true;
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
/* lzg_wd_edge_valid_terminal removed: with $ sentinels, every last token
|
|
203
|
+
* is guaranteed novel (contains $), so case-3 exception is eliminated. */
|
|
204
|
+
|
|
205
|
+
/**
|
|
206
|
+
* Record a node's subpattern into the walk dictionary.
|
|
207
|
+
* Convenience for recording the initial node of a walk.
|
|
208
|
+
*/
|
|
209
|
+
static inline void lzg_wd_record_node(LZGWalkDict *wd, const LZGGraph *g,
|
|
210
|
+
uint32_t node_id) {
|
|
211
|
+
if (g->node_sp_hash && g->node_single_char_idx) {
|
|
212
|
+
(void)lzg_wd_record_prehashed(wd, g->node_sp_hash[node_id],
|
|
213
|
+
g->node_single_char_idx[node_id]);
|
|
214
|
+
} else {
|
|
215
|
+
lzg_wd_record(wd, g->pool, g->node_sp_id[node_id], g->node_sp_len[node_id]);
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
/**
|
|
220
|
+
* Record an edge's destination subpattern into the walk dictionary.
|
|
221
|
+
*/
|
|
222
|
+
static inline void lzg_wd_record_edge(LZGWalkDict *wd, const LZGGraph *g,
|
|
223
|
+
uint32_t edge) {
|
|
224
|
+
if (g->edge_sp_hash && g->edge_single_char_idx) {
|
|
225
|
+
(void)lzg_wd_record_prehashed(wd, g->edge_sp_hash[edge],
|
|
226
|
+
g->edge_single_char_idx[edge]);
|
|
227
|
+
} else {
|
|
228
|
+
lzg_wd_record(wd, g->pool, g->edge_sp_id[edge], g->edge_sp_len[edge]);
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
#endif /* LZGRAPH_WALK_DICT_H */
|
|
@@ -11,6 +11,45 @@ static uint32_t next_pow2(uint32_t v) {
|
|
|
11
11
|
v |= v >> 8; v |= v >> 16; return v + 1;
|
|
12
12
|
}
|
|
13
13
|
|
|
14
|
+
/**
 * Scramble a 64-bit key so its low bits are usable as a table index.
 *
 * Two rounds of xor-shift-by-33 followed by an odd multiplication, then a
 * final xor-shift. Zero maps to zero.
 */
static inline uint64_t hm_mix64(uint64_t x) {
    static const uint64_t multipliers[2] = {
        0xff51afd7ed558ccdULL,
        0xc4ceb9fe1a85ec53ULL,
    };

    for (int round = 0; round < 2; round++) {
        x ^= x >> 33;
        x *= multipliers[round];
    }
    return x ^ (x >> 33);
}
|
|
22
|
+
|
|
23
|
+
static uint32_t hm_find_existing(const LZGHashMap *m, uint64_t key) {
|
|
24
|
+
uint32_t mask = m->capacity - 1;
|
|
25
|
+
uint32_t idx = (uint32_t)(hm_mix64(key) & mask);
|
|
26
|
+
while (1) {
|
|
27
|
+
uint64_t k = m->keys[idx];
|
|
28
|
+
if (k == key) return idx;
|
|
29
|
+
if (k == LZG_HM_EMPTY) return UINT32_MAX;
|
|
30
|
+
idx = (idx + 1) & mask;
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
static uint32_t hm_find_insert_slot(const LZGHashMap *m, uint64_t key, bool *found) {
|
|
35
|
+
uint32_t mask = m->capacity - 1;
|
|
36
|
+
uint32_t idx = (uint32_t)(hm_mix64(key) & mask);
|
|
37
|
+
uint32_t tomb = UINT32_MAX;
|
|
38
|
+
while (1) {
|
|
39
|
+
uint64_t k = m->keys[idx];
|
|
40
|
+
if (k == key) {
|
|
41
|
+
*found = true;
|
|
42
|
+
return idx;
|
|
43
|
+
}
|
|
44
|
+
if (k == LZG_HM_EMPTY) {
|
|
45
|
+
*found = false;
|
|
46
|
+
return (tomb != UINT32_MAX) ? tomb : idx;
|
|
47
|
+
}
|
|
48
|
+
if (k == LZG_HM_DELETED && tomb == UINT32_MAX) tomb = idx;
|
|
49
|
+
idx = (idx + 1) & mask;
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
|
|
14
53
|
LZGHashMap *lzg_hm_create(uint32_t initial_capacity) {
|
|
15
54
|
if (initial_capacity < 16) initial_capacity = 16;
|
|
16
55
|
initial_capacity = next_pow2(initial_capacity);
|
|
@@ -42,10 +81,10 @@ static void hm_resize(LZGHashMap *m, uint32_t new_cap) {
|
|
|
42
81
|
m->capacity = new_cap;
|
|
43
82
|
m->count = m->tombstones = 0;
|
|
44
83
|
memset(m->keys, 0xFF, new_cap * sizeof(uint64_t));
|
|
45
|
-
uint32_t mask = new_cap - 1;
|
|
46
84
|
for (uint32_t i = 0; i < oc; i++) {
|
|
47
85
|
if (ok[i] != LZG_HM_EMPTY && ok[i] != LZG_HM_DELETED) {
|
|
48
|
-
uint32_t
|
|
86
|
+
uint32_t mask = new_cap - 1;
|
|
87
|
+
uint32_t idx = (uint32_t)(hm_mix64(ok[i]) & mask);
|
|
49
88
|
while (m->keys[idx] != LZG_HM_EMPTY) idx = (idx + 1) & mask;
|
|
50
89
|
m->keys[idx] = ok[i];
|
|
51
90
|
m->values[idx] = ov[i];
|
|
@@ -58,46 +97,61 @@ static void hm_resize(LZGHashMap *m, uint32_t new_cap) {
|
|
|
58
97
|
bool lzg_hm_put(LZGHashMap *m, uint64_t key, uint64_t value) {
|
|
59
98
|
if ((m->count + m->tombstones) * 5 > m->capacity * 3)
|
|
60
99
|
hm_resize(m, m->capacity * 2);
|
|
61
|
-
|
|
62
|
-
uint32_t idx = (
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
if (k == key) { m->values[idx] = value; return false; }
|
|
67
|
-
if (k == LZG_HM_EMPTY) {
|
|
68
|
-
if (tomb != UINT32_MAX) { idx = tomb; m->tombstones--; }
|
|
69
|
-
m->keys[idx] = key; m->values[idx] = value; m->count++;
|
|
70
|
-
return true;
|
|
71
|
-
}
|
|
72
|
-
if (k == LZG_HM_DELETED && tomb == UINT32_MAX) tomb = idx;
|
|
73
|
-
idx = (idx + 1) & mask;
|
|
100
|
+
bool found = false;
|
|
101
|
+
uint32_t idx = hm_find_insert_slot(m, key, &found);
|
|
102
|
+
if (found) {
|
|
103
|
+
m->values[idx] = value;
|
|
104
|
+
return false;
|
|
74
105
|
}
|
|
106
|
+
if (m->keys[idx] == LZG_HM_DELETED) m->tombstones--;
|
|
107
|
+
m->keys[idx] = key;
|
|
108
|
+
m->values[idx] = value;
|
|
109
|
+
m->count++;
|
|
110
|
+
return true;
|
|
75
111
|
}
|
|
76
112
|
|
|
77
113
|
uint64_t *lzg_hm_get(const LZGHashMap *m, uint64_t key) {
|
|
78
|
-
uint32_t
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
114
|
+
uint32_t idx = hm_find_existing(m, key);
|
|
115
|
+
if (idx == UINT32_MAX) return NULL;
|
|
116
|
+
return (uint64_t *)&m->values[idx];
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
/**
 * Look up `key`, inserting it with `initial_value` if it is absent.
 *
 * The table is grown first when live entries plus tombstones exceed 60% of
 * capacity, so the slot found afterwards stays valid.
 *
 * @param inserted  Optional; set to true iff a new entry was created.
 * @return writable pointer to the value stored for `key`.
 */
uint64_t *lzg_hm_get_or_insert(LZGHashMap *m, uint64_t key,
                               uint64_t initial_value, bool *inserted) {
    if ((m->count + m->tombstones) * 5 > m->capacity * 3) {
        hm_resize(m, m->capacity * 2);
    }

    bool present = false;
    const uint32_t slot = hm_find_insert_slot(m, key, &present);

    if (!present) {
        /* Reusing a tombstone removes it from the tombstone tally. */
        if (m->keys[slot] == LZG_HM_DELETED) {
            m->tombstones--;
        }
        m->keys[slot] = key;
        m->values[slot] = initial_value;
        m->count++;
    }

    if (inserted) {
        *inserted = !present;
    }
    return &m->values[slot];
}
|
|
138
|
+
|
|
139
|
+
/**
 * Add `delta` to the value stored at `key`; an absent key starts at `delta`.
 *
 * @param inserted  Optional; set to true iff a new entry was created.
 * @return writable pointer to the updated value.
 */
uint64_t *lzg_hm_add_u64(LZGHashMap *m, uint64_t key,
                         uint64_t delta, bool *inserted) {
    bool fresh = false;
    uint64_t *cell = lzg_hm_get_or_insert(m, key, delta, &fresh);

    /* A new entry was seeded with `delta`; only existing ones accumulate. */
    if (!fresh) {
        *cell += delta;
    }
    if (inserted) {
        *inserted = fresh;
    }
    return cell;
}
|
|
87
147
|
|
|
88
148
|
bool lzg_hm_delete(LZGHashMap *m, uint64_t key) {
|
|
89
|
-
uint32_t
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
m->count--; m->tombstones++;
|
|
96
|
-
return true;
|
|
97
|
-
}
|
|
98
|
-
if (k == LZG_HM_EMPTY) return false;
|
|
99
|
-
idx = (idx + 1) & mask;
|
|
100
|
-
}
|
|
149
|
+
uint32_t idx = hm_find_existing(m, key);
|
|
150
|
+
if (idx == UINT32_MAX) return false;
|
|
151
|
+
m->keys[idx] = LZG_HM_DELETED;
|
|
152
|
+
m->count--;
|
|
153
|
+
m->tombstones++;
|
|
154
|
+
return true;
|
|
101
155
|
}
|
|
102
156
|
|
|
103
157
|
void lzg_hm_clear(LZGHashMap *m) {
|