compiled-knowledge 4.0.0a25__cp312-cp312-macosx_11_0_arm64.whl → 4.1.0__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of compiled-knowledge might be problematic. Click here for more details.

Files changed (45) hide show
  1. ck/circuit/_circuit_cy.c +1 -1
  2. ck/circuit/_circuit_cy.cpython-312-darwin.so +0 -0
  3. ck/circuit_compiler/cython_vm_compiler/_compiler.c +152 -152
  4. ck/circuit_compiler/cython_vm_compiler/_compiler.cpython-312-darwin.so +0 -0
  5. ck/circuit_compiler/interpret_compiler.py +2 -2
  6. ck/circuit_compiler/llvm_compiler.py +4 -4
  7. ck/circuit_compiler/support/circuit_analyser/_circuit_analyser_cy.c +1 -1
  8. ck/circuit_compiler/support/circuit_analyser/_circuit_analyser_cy.cpython-312-darwin.so +0 -0
  9. ck/circuit_compiler/support/input_vars.py +4 -4
  10. ck/dataset/__init__.py +1 -0
  11. ck/dataset/cross_table.py +334 -0
  12. ck/dataset/dataset.py +682 -0
  13. ck/dataset/dataset_builder.py +519 -0
  14. ck/dataset/dataset_compute.py +140 -0
  15. ck/dataset/dataset_from_crosstable.py +64 -0
  16. ck/dataset/dataset_from_csv.py +151 -0
  17. ck/dataset/sampled_dataset.py +96 -0
  18. ck/learning/__init__.py +0 -0
  19. ck/learning/coalesce_cross_tables.py +403 -0
  20. ck/learning/model_from_cross_tables.py +296 -0
  21. ck/learning/parameters.py +117 -0
  22. ck/learning/train_generative_bn.py +198 -0
  23. ck/pgm.py +39 -35
  24. ck/pgm_circuit/marginals_program.py +5 -0
  25. ck/pgm_circuit/program_with_slotmap.py +23 -45
  26. ck/pgm_circuit/support/compile_circuit.py +2 -4
  27. ck/pgm_circuit/wmc_program.py +5 -0
  28. ck/pgm_compiler/support/circuit_table/_circuit_table_cy.c +1 -1
  29. ck/pgm_compiler/support/circuit_table/_circuit_table_cy.cpython-312-darwin.so +0 -0
  30. ck/probability/cross_table_probability_space.py +53 -0
  31. ck/probability/divergence.py +226 -0
  32. ck/probability/empirical_probability_space.py +1 -0
  33. ck/probability/probability_space.py +43 -19
  34. ck_demos/dataset/__init__.py +0 -0
  35. ck_demos/dataset/demo_dataset_builder.py +37 -0
  36. ck_demos/dataset/demo_dataset_from_sampler.py +18 -0
  37. ck_demos/learning/__init__.py +0 -0
  38. ck_demos/learning/demo_bayesian_network_from_cross_tables.py +70 -0
  39. ck_demos/learning/demo_simple_learning.py +55 -0
  40. ck_demos/sampling/demo_wmc_direct_sampler.py +2 -2
  41. {compiled_knowledge-4.0.0a25.dist-info → compiled_knowledge-4.1.0.dist-info}/METADATA +2 -1
  42. {compiled_knowledge-4.0.0a25.dist-info → compiled_knowledge-4.1.0.dist-info}/RECORD +45 -24
  43. {compiled_knowledge-4.0.0a25.dist-info → compiled_knowledge-4.1.0.dist-info}/WHEEL +0 -0
  44. {compiled_knowledge-4.0.0a25.dist-info → compiled_knowledge-4.1.0.dist-info}/licenses/LICENSE.txt +0 -0
  45. {compiled_knowledge-4.0.0a25.dist-info → compiled_knowledge-4.1.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,64 @@
1
+ from typing import Sequence
2
+
3
+ import numpy as np
4
+
5
+ from ck.dataset import HardDataset, SoftDataset
6
+ from ck.dataset.cross_table import CrossTable, cross_table_from_soft_dataset
7
+ from ck.pgm import RandomVariable
8
+ from ck.utils.np_extras import dtype_for_number_of_states
9
+
10
+
11
def dataset_from_cross_table(cross_table: CrossTable) -> HardDataset:
    """
    Convert a cross-table into a weighted HardDataset.

    Every `(instance, weight)` item of the cross-table becomes one row
    of the dataset, with `weight` as the row's instance weight.

    Args:
        cross_table: the cross-table to convert. Its `rvs` define the
            dataset columns and its `items()` define the rows.

    Returns:
        A HardDataset with one column per random variable of the
        cross-table and one row per cross-table item.

    Ensures:
        The sum of the returned instance weights equals the sum of the
        weights stored in `cross_table`.
    """
    rvs: Sequence[RandomVariable] = cross_table.rvs

    # Transpose the cross-table: one list of states per random variable,
    # with the instance weights collected in a parallel list.
    columns = [[] for _ in range(len(rvs))]
    instance_weights = []
    for instance, weight in cross_table.items():
        for column, state in zip(columns, instance):
            column.append(state)
        instance_weights.append(weight)

    def to_state_array(rv, column):
        # Use the dtype that `dtype_for_number_of_states` selects for
        # the number of states of the random variable.
        return np.array(column, dtype=dtype_for_number_of_states(len(rv)))

    data = (
        (rv, to_state_array(rv, column))
        for rv, column in zip(rvs, columns)
    )
    weights_array = np.array(instance_weights, dtype=np.float64)
    return HardDataset(data=data, weights=weights_array)
44
+
45
+
46
def expand_soft_dataset(soft_dataset: SoftDataset) -> HardDataset:
    """
    Expand the soft evidence of a soft dataset into weighted hard instances.

    The conversion is done in two steps: the soft dataset is first
    summarised as a cross-table (`cross_table_from_soft_dataset`), and
    the cross-table is then turned into a hard dataset
    (`dataset_from_cross_table`).

    Because the result is derived from a cross-table, each distinct
    instance appears at most once in the result, the number of instances
    may differ from that of `soft_dataset`, and the ordering of instances
    is not guaranteed.

    Args:
        soft_dataset: the soft dataset to expand.

    Returns:
        A HardDataset built from the cross-table of `soft_dataset`.
    """
    return dataset_from_cross_table(cross_table_from_soft_dataset(soft_dataset))
@@ -0,0 +1,151 @@
1
+ from typing import Iterable, List, Sequence, Optional
2
+
3
+ from ck.dataset import HardDataset
4
+ from ck.pgm import RandomVariable
5
+
6
+
7
+ def hard_dataset_from_csv(
8
+ rvs: Iterable[RandomVariable],
9
+ lines: Iterable[str],
10
+ *,
11
+ weights: Optional[int | str] = None,
12
+ sep: Optional[str] = ',',
13
+ comment: str = '#',
14
+ ) -> HardDataset:
15
+ """
16
+ Interpret the given sequence of lines as CSV for a HardDataset.
17
+
18
+ Each line is a list of state indexes (ints) separated by `sep`.
19
+
20
+ Every line should have the same number of values.
21
+
22
+ If the first line contains a non-integer value, then the first
23
+ line will be interpreted as a header line.
24
+
25
+ If there is no header line, then the values will be interpreted in the
26
+ same order as `rvs` and the number of values on each line should be
27
+ the same as the number of random variables in `rvs`.
28
+
29
+ If there is a header line, then it will be interpreted as the order
30
+ of random variables. There must be a column name in the header to match
31
+ each name of the given random variables. Additional columns will be ignored.
32
+
33
+ Leading and trailing whitespace is ignored for each field, including header column names.
34
+
35
+ As text file (and StringIO) objects are iterable over lines, here is how to read a csv file:
36
+ ```
37
+ with open(csv_filename, 'r') as file:
38
+ hard_dataset_from_csv(rvs, file)
39
+ ```
40
+ Here is an example to read from a csv string:
41
+ ```
42
+ hard_dataset_from_csv(rvs, csv_string.splitlines())
43
+ ```
44
+
45
+ Args:
46
+ rvs: the random variables for the returned dataset.
47
+ lines: the sequence of lines to interpret, each line is an instance in the dataset.
48
+ weights: the column in the csv file holding instance weights. Can be either the
49
+ column number (counting from zero) or a column name (requires a header line).
50
+ sep: the string to use to separate values in a line, default is a comma.
51
+ If set to `None`, lines will be split on any consecutive run of whitespace characters
52
+ (including \n \r \t \f and spaces).
53
+ comment: text starting with this will be treated as a comment. Set to '' to disallow comments.
54
+
55
+ Returns:
56
+ a HardDataset.
57
+
58
+ Raises:
59
+ ValueError: if the lines do not conform to a CSV format.
60
+ """
61
+ rvs: Sequence[RandomVariable] = tuple(rvs)
62
+
63
+ # Define `clean_line` being sensitive to comments.
64
+ if len(comment) > 0:
65
+ def clean_line(l: str) -> str:
66
+ i = l.find(comment)
67
+ if i >= 0:
68
+ l = l[:i]
69
+ return l.strip()
70
+ else:
71
+ def clean_line(l: str) -> str:
72
+ return l.strip()
73
+
74
+ # Get the first line which may be a header line or data line
75
+ it = iter(lines)
76
+ try:
77
+ while True:
78
+ line = clean_line(next(it))
79
+ if len(line) > 0:
80
+ break
81
+ except StopIteration:
82
+ # Empty dataset with the given random variables
83
+ return HardDataset((rv, []) for rv in rvs)
84
+
85
+ values: List[str] = [value.strip() for value in line.split(sep)]
86
+ number_of_columns: int = len(values)
87
+ series: List[List[int]] # series[dataset-column] = list of values
88
+ weight_series: Optional[List[float]] = None
89
+ column_map: List[int] # column_map[dataset-column] = input-column
90
+ if all(_is_number(value) for value in values):
91
+ # First line is not a header line
92
+ if weights is None:
93
+ if number_of_columns != len(rvs):
94
+ raise ValueError('number of columns does not match number of random variables')
95
+ column_map = list(range(len(rvs)))
96
+ else:
97
+ if number_of_columns != len(rvs) + 1:
98
+ raise ValueError('number of columns does not match number of random variables and weight column')
99
+ if not isinstance(weights, int):
100
+ raise ValueError('no header detected - `weights` must be a column number')
101
+ if not (-number_of_columns <= weights < number_of_columns):
102
+ raise ValueError('`weights` column number out of range')
103
+ column_map = list(range(len(rvs) + 1))
104
+ column_map.pop(weights)
105
+
106
+ # Initialise series with the first line of data
107
+ series = [[int(values[i])] for i in column_map]
108
+ if weights is not None:
109
+ weight_series = [float(values[weights])]
110
+
111
+ else:
112
+ # First line is a header line
113
+ # Lookup each random variable to find its column
114
+ column_map = [
115
+ values.index(rv.name) # will raise ValueError if not found
116
+ for rv in rvs
117
+ ]
118
+ if isinstance(weights, str):
119
+ # Convert weights column name to column number
120
+ weights: int = values.index(weights) # will raise ValueError if not found
121
+ elif isinstance(weights, int) and not (number_of_columns <= weights < number_of_columns):
122
+ raise ValueError('`weights` column number out of range')
123
+
124
+ # Initialise each series as empty
125
+ series = [[] for _ in rvs]
126
+ if weights is not None:
127
+ weight_series = []
128
+
129
+ # Read remaining data lines
130
+ for line in it:
131
+ line = clean_line(line)
132
+ if len(line) == 0:
133
+ continue
134
+ if len(values) != number_of_columns:
135
+ raise ValueError('number of values does not match number of columns')
136
+ values = line.split(sep)
137
+ for series_i, i in zip(series, column_map):
138
+ series_i.append(int(values[i]))
139
+ if weights is not None:
140
+ weight_series.append(float(values[weights]))
141
+
142
+ # Construct the dataset
143
+ return HardDataset(zip(rvs, series), weights=weight_series)
144
+
145
+
146
+ def _is_number(s: str) -> bool:
147
+ try:
148
+ float(s)
149
+ return True
150
+ except ValueError:
151
+ return False
@@ -0,0 +1,96 @@
1
+ import random
2
+ from dataclasses import dataclass
3
+ from typing import Sequence, List, Iterator, Tuple, Dict
4
+
5
+ import numpy as np
6
+
7
+ from ck.dataset import HardDataset
8
+ from ck.dataset.cross_table import CrossTable
9
+ from ck.pgm import RandomVariable, Instance
10
+ from ck.sampling.sampler import Sampler
11
+ from ck.utils.np_extras import dtype_for_number_of_states, NDArray
12
+ from ck.utils.random_extras import Random
13
+
14
+
15
def dataset_from_sampler(sampler: Sampler, length: int) -> HardDataset:
    """
    Build a hard dataset by drawing samples from a sampler.

    Args:
        sampler: A sampler which defines the random variables (`sampler.rvs`)
            and provides the samples (`sampler.take`).
        length: The number of instances to request from the sampler, which
            is also the length of each dataset column.

    Returns:
        A HardDataset with one column per random variable of the sampler,
        where row `i` holds the states of the `i`-th sample taken.
    """
    rvs: Sequence[RandomVariable] = sampler.rvs

    # Allocate one zero-filled column per random variable up front.
    columns: List[NDArray] = []
    for rv in rvs:
        state_dtype = dtype_for_number_of_states(len(rv))
        columns.append(np.zeros(length, dtype=state_dtype))

    # Write each sampled instance into its row, across the columns.
    for row, sample in enumerate(sampler.take(length)):
        for column, state in zip(columns, sample):
            column[row] = state

    return HardDataset(zip(rvs, columns))
35
+
36
+
37
class CrossTableSampler(Sampler):
    """
    A sampler that draws instances from a cross-table, where each
    instance is drawn with probability proportional to its weight.
    """

    def __init__(self, crosstab: CrossTable, rand: Random = random):
        """
        Adapt a cross table to a sampler.

        Instances will be drawn from the sampler according to their
        weight in the given cross-table. The instances and weights are
        copied into the sampler at construction time, so if the given
        cross-table is modified after constructing the sampler, the
        sampler will not be affected.

        Args:
            crosstab: the cross-table providing instances and weights.
            rand: the source of random numbers; only its `random()`
                method is used. Defaults to the `random` module.

        Raises:
            ValueError: if the cross-table has no instances.
        """
        if len(crosstab) == 0:
            raise ValueError('no instances to sample')

        super().__init__(rvs=crosstab.rvs, condition=())

        # Group instances by weight.
        # We do this in anticipation that it makes sampling more efficient.
        weight_groups: Dict[float, _WeightGroup] = {}
        for instance, weight in crosstab.items():
            weight_group = weight_groups.get(weight)
            if weight_group is None:
                weight_groups[weight] = _WeightGroup(weight, weight, [instance])
            else:
                weight_group.append(instance)

        self._weight_groups: List[_WeightGroup] = list(weight_groups.values())
        self._total_weight = sum(group.total for group in self._weight_groups)
        self._rand = rand

    def __iter__(self) -> Iterator[Instance]:
        """
        Yield an unbounded stream of instances, each drawn independently
        with probability proportional to its weight.
        """
        groups = self._weight_groups
        while True:
            # This code performs inverse transform sampling
            r: float = self._rand.random() * self._total_weight

            # This does a serial search to find the weight group.
            # This is efficient for small numbers of groups, but this may be
            # improved for large numbers of groups.
            for group in groups:
                if r < group.total:
                    # Pick an instance in the group. The index is clamped
                    # because `group.total` is an accumulated float sum, so
                    # `r / group.weight` can round up to `len(instances)`.
                    i = min(int(r / group.weight), len(group.instances) - 1)
                    break
                r -= group.total
            else:
                # Floating point rounding in the repeated subtraction left
                # `r` at or beyond the end of the last group. Without this
                # branch the search would run off the end of the groups
                # (a StopIteration inside a generator becomes RuntimeError).
                group = groups[-1]
                i = len(group.instances) - 1

            yield group.instances[i]
83
+
84
+
85
+ @dataclass
86
+ class _WeightGroup:
87
+ """
88
+ Support for CrossTableSampler.
89
+ """
90
+ weight: float
91
+ total: float
92
+ instances: List[Tuple[int, ...]]
93
+
94
+ def append(self, instance: Tuple[int, ...]) -> None:
95
+ self.total += self.weight
96
+ self.instances.append(instance)
File without changes