compiled-knowledge 4.0.0a25__cp312-cp312-win_amd64.whl → 4.1.0__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of compiled-knowledge might be problematic. Click here for more details.
- ck/circuit/_circuit_cy.c +1 -1
- ck/circuit/_circuit_cy.cp312-win_amd64.pyd +0 -0
- ck/circuit_compiler/cython_vm_compiler/_compiler.c +152 -152
- ck/circuit_compiler/cython_vm_compiler/_compiler.cp312-win_amd64.pyd +0 -0
- ck/circuit_compiler/interpret_compiler.py +2 -2
- ck/circuit_compiler/llvm_compiler.py +4 -4
- ck/circuit_compiler/support/circuit_analyser/_circuit_analyser_cy.c +1 -1
- ck/circuit_compiler/support/circuit_analyser/_circuit_analyser_cy.cp312-win_amd64.pyd +0 -0
- ck/circuit_compiler/support/input_vars.py +4 -4
- ck/dataset/__init__.py +1 -0
- ck/dataset/cross_table.py +334 -0
- ck/dataset/dataset.py +682 -0
- ck/dataset/dataset_builder.py +519 -0
- ck/dataset/dataset_compute.py +140 -0
- ck/dataset/dataset_from_crosstable.py +64 -0
- ck/dataset/dataset_from_csv.py +151 -0
- ck/dataset/sampled_dataset.py +96 -0
- ck/learning/__init__.py +0 -0
- ck/learning/coalesce_cross_tables.py +403 -0
- ck/learning/model_from_cross_tables.py +296 -0
- ck/learning/parameters.py +117 -0
- ck/learning/train_generative_bn.py +198 -0
- ck/pgm.py +39 -35
- ck/pgm_circuit/marginals_program.py +5 -0
- ck/pgm_circuit/program_with_slotmap.py +23 -45
- ck/pgm_circuit/support/compile_circuit.py +2 -4
- ck/pgm_circuit/wmc_program.py +5 -0
- ck/pgm_compiler/support/circuit_table/_circuit_table_cy.c +1 -1
- ck/pgm_compiler/support/circuit_table/_circuit_table_cy.cp312-win_amd64.pyd +0 -0
- ck/probability/cross_table_probability_space.py +53 -0
- ck/probability/divergence.py +226 -0
- ck/probability/empirical_probability_space.py +1 -0
- ck/probability/probability_space.py +43 -19
- ck_demos/dataset/__init__.py +0 -0
- ck_demos/dataset/demo_dataset_builder.py +37 -0
- ck_demos/dataset/demo_dataset_from_sampler.py +18 -0
- ck_demos/learning/__init__.py +0 -0
- ck_demos/learning/demo_bayesian_network_from_cross_tables.py +70 -0
- ck_demos/learning/demo_simple_learning.py +55 -0
- ck_demos/sampling/demo_wmc_direct_sampler.py +2 -2
- {compiled_knowledge-4.0.0a25.dist-info → compiled_knowledge-4.1.0.dist-info}/METADATA +2 -1
- {compiled_knowledge-4.0.0a25.dist-info → compiled_knowledge-4.1.0.dist-info}/RECORD +45 -24
- {compiled_knowledge-4.0.0a25.dist-info → compiled_knowledge-4.1.0.dist-info}/WHEEL +0 -0
- {compiled_knowledge-4.0.0a25.dist-info → compiled_knowledge-4.1.0.dist-info}/licenses/LICENSE.txt +0 -0
- {compiled_knowledge-4.0.0a25.dist-info → compiled_knowledge-4.1.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from typing import Sequence
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from ck.dataset import HardDataset, SoftDataset
|
|
6
|
+
from ck.dataset.cross_table import CrossTable, cross_table_from_soft_dataset
|
|
7
|
+
from ck.pgm import RandomVariable
|
|
8
|
+
from ck.utils.np_extras import dtype_for_number_of_states
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def dataset_from_cross_table(cross_table: CrossTable) -> HardDataset:
    """
    Build a HardDataset whose rows are the items of a cross-table.

    Args:
        cross_table: the cross-table to convert. Its `rvs` attribute supplies
            the dataset columns and `items()` is iterated as
            (instance, weight) pairs.

    Returns:
        A HardDataset with one row per cross-table item, in the iteration order
        of `cross_table.items()`. Each column is stored with the dtype given by
        `dtype_for_number_of_states` for its random variable, and the instance
        weights are stored as float64.

    Ensures:
        Every row carries the weight of its cross-table item, so the weights of
        the result are exactly the weights of `cross_table`.
    """
    rvs: Sequence[RandomVariable] = cross_table.rvs

    # Transpose the (instance -> weight) items into one state list per random
    # variable, plus a parallel list of weights.
    columns = [[] for _ in range(len(rvs))]
    row_weights = []
    for instance, weight in cross_table.items():
        row_weights.append(weight)
        for column, state in zip(columns, instance):
            column.append(state)

    # Convert each column to the compact dtype appropriate for its random variable.
    arrays = [
        np.array(column, dtype=dtype_for_number_of_states(len(rv)))
        for rv, column in zip(rvs, columns)
    ]

    return HardDataset(
        data=zip(rvs, arrays),
        weights=np.array(row_weights, dtype=np.float64),
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def expand_soft_dataset(soft_dataset: SoftDataset) -> HardDataset:
    """
    Convert a soft dataset into a hard dataset with the same data semantics.

    State weights in `soft_dataset` that express uncertainty over the states of
    a random variable are replaced by an equivalent set of weighted hard
    instances. Consequently, the returned dataset may contain a different
    number of instances to the given soft dataset, and the ordering of
    instances in the result is not guaranteed.

    The conversion is done in two steps: the soft dataset is summarised as a
    cross-table (`cross_table_from_soft_dataset`), and the cross-table is then
    turned into a hard dataset (`dataset_from_cross_table`). Because the result
    comes from a cross-table it is expected to contain no duplicated instances
    and no zero-weight instances (these are properties of `CrossTable`, which is
    defined elsewhere).

    Args:
        soft_dataset: the soft dataset to expand.

    Returns:
        A HardDataset built from the cross-table of `soft_dataset`.
    """
    return dataset_from_cross_table(cross_table_from_soft_dataset(soft_dataset))
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
from typing import Iterable, List, Sequence, Optional
|
|
2
|
+
|
|
3
|
+
from ck.dataset import HardDataset
|
|
4
|
+
from ck.pgm import RandomVariable
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def hard_dataset_from_csv(
        rvs: Iterable[RandomVariable],
        lines: Iterable[str],
        *,
        weights: Optional[int | str] = None,
        sep: Optional[str] = ',',
        comment: str = '#',
) -> HardDataset:
    """
    Interpret the given sequence of lines as CSV for a HardDataset.

    Each line is a list of state indexes (ints) separated by `sep`.

    Every line should have the same number of values.

    If the first non-blank line contains a value that cannot be parsed as a
    number, then that line will be interpreted as a header line.

    If there is no header line, then the values will be interpreted in the
    same order as `rvs` and the number of values on each line should be
    the same as the number of random variables in `rvs` (plus one if a
    weights column is requested).

    If there is a header line, then it will be interpreted as the order
    of random variables. There must be a column name in the header to match
    each name of the given random variables. Additional columns will be ignored.

    Leading and trailing whitespace is ignored for each field, including header column names.
    Blank lines (after removing comments) are skipped.

    As text file (and StringIO) objects are iterable over lines, here is how to read a csv file:
    ```
    with open(csv_filename, 'r') as file:
        hard_dataset_from_csv(rvs, file)
    ```
    Here is an example to read from a csv string:
    ```
    hard_dataset_from_csv(rvs, csv_string.splitlines())
    ```

    Args:
        rvs: the random variables for the returned dataset.
        lines: the sequence of lines to interpret, each line is an instance in the dataset.
        weights: the column in the csv file holding instance weights. Can be either the
            column number (counting from zero, negative numbers count from the last column)
            or a column name (requires a header line).
        sep: the string to use to separate values in a line, default is a comma.
            If set to `None`, lines will be split on any consecutive run of whitespace characters
            (including \\n \\r \\t \\f and spaces).
        comment: text starting with this will be treated as a comment. Set to '' to disallow comments.

    Returns:
        a HardDataset. If there are no non-blank lines, the dataset is empty and unweighted.

    Raises:
        ValueError: if the lines do not conform to a CSV format, a line has the wrong
            number of values, a random variable or weights column name is missing
            from the header, or a `weights` column number is out of range.
        TypeError: if a header is present and `weights` is neither None, an int nor a str.
    """
    rvs = tuple(rvs)

    def clean_line(text: str) -> str:
        # Drop any trailing comment (when comments are enabled), then surrounding whitespace.
        if comment:
            comment_start = text.find(comment)
            if comment_start >= 0:
                text = text[:comment_start]
        return text.strip()

    # Find the first non-blank line, which may be a header line or a data line.
    it = iter(lines)
    for line in it:
        line = clean_line(line)
        if line:
            break
    else:
        # Empty dataset with the given random variables
        return HardDataset((rv, []) for rv in rvs)

    values: List[str] = [value.strip() for value in line.split(sep)]
    number_of_columns: int = len(values)
    series: List[List[int]]  # series[dataset-column] = list of values
    weight_series: Optional[List[float]] = None
    weight_column: Optional[int] = None  # input-column holding weights, if any
    column_map: List[int]  # column_map[dataset-column] = input-column

    if all(_is_number(value) for value in values):
        # First line is not a header line
        if weights is None:
            if number_of_columns != len(rvs):
                raise ValueError('number of columns does not match number of random variables')
            column_map = list(range(len(rvs)))
        else:
            if number_of_columns != len(rvs) + 1:
                raise ValueError('number of columns does not match number of random variables and weight column')
            if not isinstance(weights, int):
                raise ValueError('no header detected - `weights` must be a column number')
            if not (-number_of_columns <= weights < number_of_columns):
                raise ValueError('`weights` column number out of range')
            weight_column = weights
            # The random variables occupy every column except the weights column.
            column_map = list(range(number_of_columns))
            column_map.pop(weight_column)

        # Initialise series with the first line of data
        series = [[int(values[i])] for i in column_map]
        if weight_column is not None:
            weight_series = [float(values[weight_column])]

    else:
        # First line is a header line
        # Lookup each random variable to find its column
        column_map = [
            values.index(rv.name)  # will raise ValueError if not found
            for rv in rvs
        ]
        if isinstance(weights, str):
            # Convert weights column name to column number
            weight_column = values.index(weights)  # will raise ValueError if not found
        elif isinstance(weights, int):
            # Same range rule as the no-header case (negative numbers count from the end).
            if not (-number_of_columns <= weights < number_of_columns):
                raise ValueError('`weights` column number out of range')
            weight_column = weights
        elif weights is not None:
            raise TypeError('`weights` must be a column number or a column name')

        # Initialise each series as empty
        series = [[] for _ in rvs]
        if weight_column is not None:
            weight_series = []

    # Read remaining data lines
    for line in it:
        line = clean_line(line)
        if not line:
            continue
        # Split first, then validate the width of *this* line.
        # `int` and `float` ignore whitespace around each field.
        values = line.split(sep)
        if len(values) != number_of_columns:
            raise ValueError('number of values does not match number of columns')
        for series_i, i in zip(series, column_map):
            series_i.append(int(values[i]))
        if weight_series is not None:
            weight_series.append(float(values[weight_column]))

    # Construct the dataset
    return HardDataset(zip(rvs, series), weights=weight_series)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _is_number(s: str) -> bool:
|
|
147
|
+
try:
|
|
148
|
+
float(s)
|
|
149
|
+
return True
|
|
150
|
+
except ValueError:
|
|
151
|
+
return False
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import random
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Sequence, List, Iterator, Tuple, Dict
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from ck.dataset import HardDataset
|
|
8
|
+
from ck.dataset.cross_table import CrossTable
|
|
9
|
+
from ck.pgm import RandomVariable, Instance
|
|
10
|
+
from ck.sampling.sampler import Sampler
|
|
11
|
+
from ck.utils.np_extras import dtype_for_number_of_states, NDArray
|
|
12
|
+
from ck.utils.random_extras import Random
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def dataset_from_sampler(sampler: Sampler, length: int) -> HardDataset:
    """
    Build a hard dataset by drawing samples from a sampler.

    One zero-initialised column of `length` entries is allocated per random
    variable of the sampler, then the columns are filled row by row from
    `sampler.take(length)`. Any rows not supplied by `take` keep state 0.

    Args:
        sampler: the sampler providing the random variables (`sampler.rvs`)
            and the samples.
        length: the number of rows to allocate and the number of samples requested.

    Returns:
        A HardDataset with one column per random variable of the sampler,
        each column holding `length` states.
    """
    rvs: Sequence[RandomVariable] = sampler.rvs

    # Pre-allocate each column using the compact dtype for its random variable.
    columns: List[NDArray] = []
    for rv in rvs:
        columns.append(np.zeros(length, dtype=dtype_for_number_of_states(len(rv))))

    # Write each sampled instance into the next row across all columns.
    row = 0
    for instance in sampler.take(length):
        for column, state in zip(columns, instance):
            column[row] = state
        row += 1

    return HardDataset(zip(rvs, columns))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class CrossTableSampler(Sampler):
    """
    A sampler that draws instances from a cross-table in proportion to their weights.
    """

    def __init__(self, crosstab: CrossTable, rand: Random = random):
        """
        Adapt a cross table to a sampler.

        Instances will be drawn from the sampler according to their
        weight in the given cross-table. If the given cross-table is
        modified after constructing the sampler, the sampler will not
        be affected.

        Args:
            crosstab: the cross-table to sample from.
            rand: the source of randomness; only its `random()` method is used.

        Raises:
            ValueError: if the cross-table has no instances, or if the total
                weight of its instances is not positive.
        """
        if len(crosstab) == 0:
            raise ValueError('no instances to sample')

        super().__init__(rvs=crosstab.rvs, condition=())

        # Group instances by weight.
        # We do this in anticipation that it makes sampling more efficient.
        weight_groups: Dict[float, _WeightGroup] = {}
        for instance, weight in crosstab.items():
            weight_group = weight_groups.get(weight)
            if weight_group is None:
                weight_groups[weight] = _WeightGroup(weight, weight, [instance])
            else:
                weight_group.append(instance)

        self._weight_groups: List[_WeightGroup] = list(weight_groups.values())
        self._total_weight = sum(group.total for group in self._weight_groups)
        self._rand = rand

        # Without positive total weight there is nothing that can be drawn;
        # fail here rather than part way through iteration.
        if not self._total_weight > 0:
            raise ValueError('total weight must be positive to sample')

    def __iter__(self) -> Iterator[Instance]:
        """
        Yield an endless stream of instances, each drawn with probability
        proportional to its weight.
        """
        groups = self._weight_groups

        # Group to use if floating-point rounding carries the search past the
        # final group: the last group that actually has weight.
        fallback = next(
            (group for group in reversed(groups) if group.total > 0),
            groups[-1],
        )

        while True:
            # This code performs inverse transform sampling
            r: float = self._rand.random() * self._total_weight

            # This does a serial search to find the weight group.
            # This is efficient for small numbers of groups, but this may be
            # improved for large numbers of groups.
            for group in groups:
                if r < group.total:
                    # Pick an instance in the group. `group.total` is an accumulated
                    # float sum, so clamp the index in case rounding makes
                    # `r / group.weight` reach the number of instances.
                    i = min(int(r / group.weight), len(group.instances) - 1)
                    break
                r -= group.total
            else:
                # Rounding pushed `r` beyond the last group. Never call `next` on an
                # exhausted iterator here: a StopIteration escaping a generator
                # becomes a RuntimeError.
                group = fallback
                i = len(group.instances) - 1

            yield group.instances[i]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass
|
|
86
|
+
class _WeightGroup:
|
|
87
|
+
"""
|
|
88
|
+
Support for CrossTableSampler.
|
|
89
|
+
"""
|
|
90
|
+
weight: float
|
|
91
|
+
total: float
|
|
92
|
+
instances: List[Tuple[int, ...]]
|
|
93
|
+
|
|
94
|
+
def append(self, instance: Tuple[int, ...]) -> None:
|
|
95
|
+
self.total += self.weight
|
|
96
|
+
self.instances.append(instance)
|
ck/learning/__init__.py
ADDED
|
File without changes
|