compiled-knowledge 4.1.0a1-cp313-cp313-win32.whl → 4.1.0a3-cp313-cp313-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of compiled-knowledge might be problematic.

Files changed (36)
  1. ck/circuit/_circuit_cy.c +1 -1
  2. ck/circuit/_circuit_cy.cp313-win32.pyd +0 -0
  3. ck/circuit_compiler/cython_vm_compiler/_compiler.c +152 -152
  4. ck/circuit_compiler/cython_vm_compiler/_compiler.cp313-win32.pyd +0 -0
  5. ck/circuit_compiler/support/circuit_analyser/_circuit_analyser_cy.c +1 -1
  6. ck/circuit_compiler/support/circuit_analyser/_circuit_analyser_cy.cp313-win32.pyd +0 -0
  7. ck/dataset/cross_table.py +143 -79
  8. ck/dataset/dataset.py +143 -38
  9. ck/dataset/dataset_builder.py +519 -0
  10. ck/dataset/dataset_from_crosstable.py +21 -2
  11. ck/dataset/dataset_from_csv.py +5 -1
  12. ck/learning/coalesce_cross_tables.py +395 -0
  13. ck/learning/model_from_cross_tables.py +242 -0
  14. ck/learning/parameters.py +117 -0
  15. ck/learning/train_generative_bn.py +198 -0
  16. ck/pgm.py +10 -8
  17. ck/pgm_circuit/marginals_program.py +5 -0
  18. ck/pgm_circuit/wmc_program.py +5 -0
  19. ck/pgm_compiler/support/circuit_table/_circuit_table_cy.c +1 -1
  20. ck/pgm_compiler/support/circuit_table/_circuit_table_cy.cp313-win32.pyd +0 -0
  21. ck/probability/divergence.py +226 -0
  22. ck/probability/probability_space.py +43 -19
  23. ck_demos/dataset/__init__.py +0 -0
  24. ck_demos/dataset/demo_dataset_builder.py +37 -0
  25. ck_demos/dataset/demo_dataset_from_sampler.py +18 -0
  26. ck_demos/learning/__init__.py +0 -0
  27. ck_demos/learning/demo_bayesian_network_from_cross_tables.py +71 -0
  28. ck_demos/learning/demo_simple_learning.py +55 -0
  29. ck_demos/sampling/demo_wmc_direct_sampler.py +2 -2
  30. {compiled_knowledge-4.1.0a1.dist-info → compiled_knowledge-4.1.0a3.dist-info}/METADATA +2 -1
  31. {compiled_knowledge-4.1.0a1.dist-info → compiled_knowledge-4.1.0a3.dist-info}/RECORD +35 -24
  32. ck/learning/train_generative.py +0 -149
  33. ck/{dataset/cross_table_probabilities.py → probability/cross_table_probability_space.py} +0 -0
  34. {compiled_knowledge-4.1.0a1.dist-info → compiled_knowledge-4.1.0a3.dist-info}/WHEEL +0 -0
  35. {compiled_knowledge-4.1.0a1.dist-info → compiled_knowledge-4.1.0a3.dist-info}/licenses/LICENSE.txt +0 -0
  36. {compiled_knowledge-4.1.0a1.dist-info → compiled_knowledge-4.1.0a3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,519 @@
+ from __future__ import annotations
+
+ from itertools import count
+ from typing import Iterable, List, TypeAlias, Sequence, overload, Set, Tuple, MutableSequence, Dict, Optional, \
+     assert_never
+
+ import numpy as np
+
+ from ck.dataset import HardDataset, SoftDataset
+ from ck.pgm import RandomVariable, State
+ from ck.utils.np_extras import NDArrayFloat64, NDArrayStates, dtype_for_number_of_states, DTypeStates, NDArrayNumeric
+
+ HardValue: TypeAlias = int
+ SoftValue: TypeAlias = Sequence[float]
+ Value: TypeAlias = HardValue | SoftValue | None
+
+
+ class Record(Sequence[Value]):
+     """
+     A record is a sequence of values, co-indexed with dataset columns.
+
+     A value is either a state index (HardValue), a sequence of state
+     weights (SoftValue), or missing (None).
+     """
+
+     def __init__(self, dataset: DatasetBuilder, values: Optional[Iterable[Value]] = None):
+         self.weight: float = 1
+         self._dataset: DatasetBuilder = dataset
+         self._values: List[Value] = [] if values is None else list(values)
+
+     def __len__(self) -> int:
+         return len(self._dataset.rvs)
+
+     @overload
+     def __getitem__(self, index: int | RandomVariable) -> Value:
+         ...
+
+     @overload
+     def __getitem__(self, index: slice) -> Sequence[Value]:
+         ...
+
+     def __getitem__(self, index):
+         if isinstance(index, slice):
+             return [self._getitem(i) for i in range(*index.indices(len(self)))]
+         if isinstance(index, RandomVariable):
+             # noinspection PyProtectedMember
+             return self._getitem(self._dataset._rvs_index[index])
+
+         size = len(self)
+         if index < 0:
+             index += size
+         if not 0 <= index < size:
+             raise IndexError('index out of range')
+         return self._getitem(index)
+
+     def _getitem(self, index: int) -> Value:
+         """
+         Assumes:
+             0 <= index < len(self).
+         """
+         if index >= len(self._values):
+             return None
+         return self._values[index]
+
+     @overload
+     def __setitem__(self, index: int | RandomVariable, value: Value) -> None:
+         ...
+
+     @overload
+     def __setitem__(self, index: slice, value: Iterable[Value]) -> None:
+         ...
+
+     def __setitem__(self, index, value):
+         if isinstance(index, slice):
+             for i, v in zip(range(*index.indices(len(self))), value):
+                 self._setitem(i, v)
+             return
+         if isinstance(index, RandomVariable):
+             # noinspection PyProtectedMember
+             self._setitem(self._dataset._rvs_index[index], value)
+             return
+
+         size = len(self)
+         if index < 0:
+             index += size
+         if not 0 <= index < size:
+             raise IndexError('index out of range')
+         self._setitem(index, value)
+
+     def _setitem(self, index: int, value: Value) -> None:
+         """
+         Assumes:
+             0 <= index < len(self).
+         """
+         to_append: int = index + 1 - len(self._values)
+         self._values += [None] * to_append
+
+         if value is None:
+             self._values[index] = None
+             return
+
+         rv: RandomVariable = self._dataset.rvs[index]
+         if isinstance(value, int):
+             if not (0 <= value < len(rv)):
+                 raise ValueError(f'state index out of range, expected: 0 <= {value!r} < {len(rv)}')
+             self._values[index] = value
+             return
+
+         # Expect the value to be a sequence of floats
+         if len(value) != len(rv):
+             raise ValueError(f'state weights incorrect length, expected: {len(rv)}, got: {len(value)}')
+         self._values[index] = tuple(value)
+
+     def set(self, *values: Value) -> None:
+         """
+         Set all the values of this record, using state indexes or state weights.
+
+         If insufficient or additional values are provided, a ValueError will be raised.
+         """
+         if len(values) != len(self):
+             raise ValueError('incorrect number of values provided')
+         for i, value in enumerate(values):
+             self._setitem(i, value)
+
+     def set_states(self, *values: State) -> None:
+         """
+         Set all the values of this record from random variable states.
+
+         State indexes are resolved using `RandomVariable.state_idx`.
+         If insufficient or additional values are provided, a ValueError will be raised.
+         """
+         rvs = self._dataset.rvs
+         if len(values) != len(rvs):
+             raise ValueError('incorrect number of values provided')
+         for i, rv, value in zip(count(), rvs, values):
+             self._setitem(i, rv.state_idx(value))
+
+     def __str__(self) -> str:
+         return self.to_str()
+
+     def to_str(
+             self,
+             *,
+             show_weight: bool = True,
+             as_states: bool = False,
+             missing: str = 'None',
+             sep: str = ', ',
+     ) -> str:
+         """
+         Render the record as a human-readable string.
+         If `as_states` is true, hard values are rendered as states instead of state indexes.
+
+         Args:
+             show_weight: If `True`, the instance weight is included.
+             as_states: If `True`, states are used instead of state indexes.
+             missing: the string to use for missing values.
+             sep: the string to use for separating values.
+         """
+
+         def _value_str(rv_idx: int, v: Value) -> str:
+             if v is None:
+                 return missing
+             if isinstance(v, int):
+                 if as_states:
+                     return repr(self._dataset.rvs[rv_idx].states[v])
+                 else:
+                     return str(v)
+             else:
+                 return str(v)
+
+         instance_str = sep.join(_value_str(i, self._getitem(i)) for i in range(len(self)))
+         if show_weight:
+             return f'({instance_str}) * {self.weight}'
+         else:
+             return f'({instance_str})'
+
+
+ class DatasetBuilder(Sequence[Record]):
+     """
+     A dataset builder can be used for making a hard or soft dataset, incrementally growing
+     the dataset as needed. It is a flexible but inefficient interim representation of data.
+     """
+
+     def __init__(self, rvs: Iterable[RandomVariable] = ()):
+         """
+         Args:
+             rvs: Optional random variables to include in the dataset. Default is no random variables.
+         """
+         self._rvs: Tuple[RandomVariable, ...] = ()
+         self._rvs_index: Dict[RandomVariable, int] = {}
+         self._records: List[Record] = []
+         self.new_column(*rvs)
+
+     @property
+     def rvs(self) -> Sequence[RandomVariable]:
+         return self._rvs
+
+     def new_column(self, *rv: RandomVariable) -> None:
+         """
+         Add one or more new random variables to the dataset. For existing rows,
+         the value for each new random variable will be `None`.
+
+         Args:
+             rv: a new random variable to include in the dataset.
+
+         Raises:
+             ValueError: if a given random variable already exists in the dataset.
+         """
+         # Do all consistency checks first to fail early, before modifying the dataset.
+         rvs_to_add: Set[RandomVariable] = set(rv)
+         if len(rvs_to_add) != len(rv):
+             raise ValueError('request to add a column includes duplicates')
+         duplicate_rvs: Set[RandomVariable] = rvs_to_add.intersection(self._rvs_index.keys())
+         if len(duplicate_rvs) > 0:
+             duplicate_rv_names = ', '.join(rv.name for rv in duplicate_rvs)
+             raise ValueError(f'column already exists in the dataset: {duplicate_rv_names}')
+
+         # Add columns in the order given (rvs_to_add is a set, so its iteration order is arbitrary)
+         for new_rv in rv:
+             self._rvs_index[new_rv] = len(self._rvs)
+             self._rvs += (new_rv,)
+
+     def ensure_column(self, *rv: RandomVariable) -> None:
+         """
+         Add a column for one or more random variables, only
+         adding a random variable if it is not already present in the dataset.
+         """
+         all_rvs = self._rvs_index.keys()
+         self.new_column(*(_rv for _rv in rv if _rv not in all_rvs))
+
+     def del_column(self, *rv: RandomVariable) -> None:
+         """
+         Delete one or more random variables from the dataset.
+
+         Args:
+             rv: a random variable to remove from the dataset.
+
+         Raises:
+             ValueError: if a given random variable does not exist in the dataset.
+         """
+         # Do all consistency checks first to fail early, before modifying the dataset.
+         rvs_to_del: Set[RandomVariable] = set(rv)
+         if len(rvs_to_del) != len(rv):
+             raise ValueError('request to delete a column includes duplicates')
+         missing_columns = rvs_to_del.difference(self._rvs_index.keys())
+         if len(missing_columns) > 0:
+             missing_rv_names = ', '.join(rv.name for rv in missing_columns)
+             raise ValueError(f'missing columns: {missing_rv_names}')
+
+         # Get column indices to remove, in descending order
+         indices = sorted((self._rvs_index[rv] for rv in rvs_to_del), reverse=True)
+
+         # Remove from column sequence
+         rvs_list: List[RandomVariable] = list(self._rvs)
+         for i in indices:
+             rvs_list.pop(i)
+         self._rvs = tuple(rvs_list)
+
+         # Rebuild the index, as the positions of remaining columns may have shifted
+         self._rvs_index = {_rv: i for i, _rv in enumerate(self._rvs)}
+
+         # Remove from records
+         for record in self._records:
+             # noinspection PyProtectedMember
+             record_values: List[Value] = record._values
+             for i in indices:
+                 if i < len(record_values):
+                     record_values.pop(i)
+
+     def total_weight(self) -> float:
+         """
+         Calculate the total weight of this dataset.
+         """
+         return sum(record.weight for record in self._records)
+
+     def get_weights(self) -> NDArrayFloat64:
+         """
+         Allocate and return a 1D numpy array of instance weights.
+
+         Ensures:
+             shape of the result == `(len(self), )`.
+         """
+         result: NDArrayFloat64 = np.fromiter(
+             (record.weight for record in self._records),
+             count=len(self._records),
+             dtype=np.float64,
+         )
+         return result
+
+     def get_column_hard(self, rv: RandomVariable, *, missing: Optional[int] = None) -> NDArrayStates:
+         """
+         Allocate and return a 1D numpy array of state indexes.
+
+         The state of a random variable (for an instance) where the value is soft evidence
+         is the state with the maximum weight. Ties are broken arbitrarily.
+
+         Args:
+             rv: a random variable in this dataset.
+             missing: the value to use in the result to represent missing values. If not provided,
+                 then the default missing value is len(rv), which is an invalid state index.
+
+         Raises:
+             ValueError: if the supplied missing value is negative.
+
+         Ensures:
+             shape of the result == `(len(self), )`.
+         """
+         index: int = self._rvs_index[rv]
+         if missing is None:
+             missing = len(rv)
+         if missing < 0:
+             raise ValueError('missing value must be >= 0')
+         number_of_states = max(len(rv), missing + 1)
+         dtype: DTypeStates = dtype_for_number_of_states(number_of_states)
+         result: NDArrayStates = np.fromiter(
+             (_get_state(record[index], missing) for record in self._records),
+             count=len(self._records),
+             dtype=dtype,
+         )
+         return result
+
+     def get_column_soft(self, rv: RandomVariable, *, missing: float | Sequence[float] = np.nan) -> NDArrayFloat64:
+         """
+         Allocate and return a numpy array of state weights.
+
+         Args:
+             rv: a random variable in this dataset.
+             missing: the value to use in the result to represent missing values. Default is all NaN.
+
+         Ensures:
+             shape of the result == `(len(self), len(rv))`.
+         """
+         index: int = self._rvs_index[rv]
+         size: int = len(rv)
+
+         missing_weights: NDArrayFloat64
+         if isinstance(missing, (float, int)):
+             missing_weights = np.array([missing] * size, dtype=np.float64)
+         else:
+             missing_weights = np.array(missing, dtype=np.float64)
+             if missing_weights.shape != (size,):
+                 raise ValueError(f'missing weights shape expected {(size,)}, but got {missing_weights.shape}')
+
+         result: NDArrayFloat64 = np.empty(shape=(len(self._records), size), dtype=np.float64)
+         for i, record in enumerate(self._records):
+             result[i, :] = _get_state_weights(size, record[index], missing_weights)
+         return result
+
+     def append(self, *values: Value) -> Record:
+         """
+         Append a new record to the dataset.
+
+         Args:
+             values: the values of the new record. If omitted, a new record will be created
+                 with all values missing (`None`).
+
+         Returns:
+             the new record.
+         """
+         record = Record(self, values)
+         self._records.append(record)
+         return record
+
+     def insert(self, index: int, values: Optional[Iterable[Value]] = None) -> Record:
+         """
+         Insert a new record into the dataset at the given index.
+
+         Args:
+             index: where to insert the record (interpreted as per builtin `list.insert`).
+             values: the values of the new record. If omitted, a new record will be created
+                 with all values missing (`None`).
+
+         Returns:
+             the new record.
+         """
+         record = Record(self, values)
+         self._records.insert(index, record)
+         return record
+
+     def append_dataset(self, dataset: HardDataset | SoftDataset) -> None:
+         """
+         Append all the records of the given dataset to this dataset builder.
+
+         Args:
+             dataset: the dataset of records to append.
+
+         Raises:
+             KeyError: if `dataset.rvs` is not a superset of `self.rvs`.
+                 To avoid this error, first call `self.ensure_column(*dataset.rvs)`.
+         """
+         cols: Tuple
+         if isinstance(dataset, HardDataset):
+             cols = tuple(dataset.state_idxs(rv).tolist() for rv in self.rvs)
+         elif isinstance(dataset, SoftDataset):
+             cols = tuple(dataset.state_weights(rv) for rv in self.rvs)
+         else:
+             assert_never(dataset)
+         weights: NDArrayNumeric = dataset.weights
+         for weight, vals in zip(weights, zip(*cols)):
+             self.append(*vals).weight = weight
+
+     @overload
+     def __getitem__(self, index: int) -> Record:
+         ...
+
+     @overload
+     def __getitem__(self, index: slice) -> MutableSequence[Record]:
+         ...
+
+     def __getitem__(self, index):
+         return self._records[index]
+
+     def __delitem__(self, index: int | slice) -> None:
+         del self._records[index]
+
+     def __len__(self) -> int:
+         return len(self._records)
+
+     def dump(
+             self,
+             *,
+             show_rvs: bool = True,
+             show_weights: bool = True,
+             as_states: bool = False,
+             missing: str = 'None',
+             sep: str = ', ',
+     ) -> None:
+         """
+         Dump the dataset in a human-readable format.
+         If `as_states` is true, hard values are dumped as states instead of state indexes.
+
+         Args:
+             show_rvs: If `True`, the random variables are dumped.
+             show_weights: If `True`, the instance weights are dumped.
+             as_states: If `True`, states are dumped instead of state indexes.
+             missing: the string to use for missing values.
+             sep: the string to use for separating values.
+         """
+         if show_rvs:
+             rvs = ', '.join(str(rv) for rv in self.rvs)
+             print(f'rvs: [{rvs}]')
+         print(f'instances ({len(self)}, with total weight {self.total_weight()}):')
+         for record in self._records:
+             print(record.to_str(show_weight=show_weights, as_states=as_states, missing=missing, sep=sep))
+
+
+ def hard_dataset_from_builder(dataset_builder: DatasetBuilder, *, missing: Optional[int] = None) -> HardDataset:
+     """
+     Create a hard dataset from a dataset builder by repeated application
+     of `HardDataset.add_rv_from_state_idxs` using values from `DatasetBuilder.get_column_hard`.
+
+     The state of a random variable (for an instance) where the value is soft evidence
+     is the state with the maximum weight. Ties are broken arbitrarily.
+
+     The instance weights of the returned dataset will simply
+     be the weights from the builder.
+
+     No adjustments are made to the resulting dataset weights, even if
+     a value in the dataset builder is soft evidence that does not sum to
+     one.
+
+     Args:
+         dataset_builder: The dataset builder providing random variables,
+             their states, and instance weights.
+         missing: the value to use in the result to represent missing values. If not provided,
+             then the default missing value is len(rv) for each rv, which is an invalid state index.
+
+     Returns:
+         A `HardDataset` instance.
+     """
+     dataset = HardDataset(weights=dataset_builder.get_weights())
+     for rv in dataset_builder.rvs:
+         dataset.add_rv_from_state_idxs(rv, dataset_builder.get_column_hard(rv, missing=missing))
+     return dataset
+
+
+ def soft_dataset_from_builder(
+         dataset_builder: DatasetBuilder,
+         *,
+         missing: float | Sequence[float] = np.nan,
+ ) -> SoftDataset:
+     """
+     Create a soft dataset from a dataset builder by repeated application
+     of `SoftDataset.add_rv_from_state_weights` using values from `DatasetBuilder.get_column_soft`.
+
+     The instance weights of the returned dataset will be a copy
+     of the instance weights of the builder.
+
+     Args:
+         dataset_builder: The dataset builder providing random variables,
+             their state weights, and instance weights.
+         missing: the value to use in the result to represent missing values.
+             If a single float is provided, all state weights will have that value. Alternatively,
+             a sequence of state weights can be provided, but then all random variables need
+             to be the same size. Default is all state weights set to NaN.
+
+     Returns:
+         A `SoftDataset` instance.
+     """
+     dataset = SoftDataset(weights=dataset_builder.get_weights())
+     for rv in dataset_builder.rvs:
+         dataset.add_rv_from_state_weights(rv, dataset_builder.get_column_soft(rv, missing=missing))
+     return dataset
+
+
+ def _get_state(value: Value, missing: int) -> int:
+     if value is None:
+         return missing
+     if isinstance(value, int):
+         return value
+     return np.argmax(value).item()
+
+
+ def _get_state_weights(size: int, value: Value, missing: Sequence[float]) -> Sequence[float]:
+     if value is None:
+         return missing
+     if isinstance(value, int):
+         result = np.zeros(size, dtype=np.float64)
+         result[value] = 1
+         return result
+     return value
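The new `ck/dataset/dataset_builder.py` above adds an incremental, record-oriented way to assemble mixed hard/soft data. Below is a minimal usage sketch of that API; it assumes `ck.pgm.PGM.new_rv(name, states)` returns the `RandomVariable` objects (an assumption, not shown in this diff) — all `dataset_builder` calls are taken from the new module itself.

```python
# Sketch of the DatasetBuilder API added in 4.1.0a3.
# Assumption: PGM.new_rv(name, states) yields RandomVariable objects.
from ck.pgm import PGM
from ck.dataset.dataset_builder import (
    DatasetBuilder,
    hard_dataset_from_builder,
    soft_dataset_from_builder,
)

pgm = PGM()
rain = pgm.new_rv('rain', ('no', 'yes'))
sprinkler = pgm.new_rv('sprinkler', ('off', 'on'))

builder = DatasetBuilder((rain, sprinkler))

builder.append(0, 1)                       # hard values: state indexes
record = builder.append()                  # all values missing (None)
record[rain] = (0.3, 0.7)                  # soft value: one weight per state
record.weight = 2.0                        # instance weight (default is 1)
builder.append().set_states('yes', 'off')  # resolved via RandomVariable.state_idx

builder.dump(as_states=True)

# Conversion: soft values collapse to their argmax state in the hard dataset,
# with missing values defaulting to len(rv), an invalid state index; hard
# values become one-hot weight vectors in the soft dataset.
hard = hard_dataset_from_builder(builder)
soft = soft_dataset_from_builder(builder)
```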
@@ -2,8 +2,8 @@ from typing import Sequence
 
  import numpy as np
 
- from ck.dataset import HardDataset
- from ck.dataset.cross_table import CrossTable
+ from ck.dataset import HardDataset, SoftDataset
+ from ck.dataset.cross_table import CrossTable, cross_table_from_soft_dataset
  from ck.pgm import RandomVariable
  from ck.utils.np_extras import dtype_for_number_of_states
 
@@ -43,3 +43,22 @@ def dataset_from_cross_table(cross_table: CrossTable) -> HardDataset:
      )
 
 
+ def expand_soft_dataset(soft_dataset: SoftDataset) -> HardDataset:
+     """
+     Construct a hard dataset with the same data semantics as the given soft dataset
+     by expanding soft evidence.
+
+     Any state weights in `soft_dataset` that represent uncertainty over the states
+     of a random variable are converted to an equivalent set of weighted hard
+     instances. This means that the returned dataset may have a different number of
+     instances from the given soft dataset.
+
+     The ordering of instances in the returned dataset is not guaranteed.
+
+     This function works by constructing a cross-table from the given soft dataset,
+     then converting the cross-table to a hard dataset using `dataset_from_cross_table`.
+     This implies that the result will have no duplicated instances and no
+     instances with weight zero.
+     """
+     crosstab: CrossTable = cross_table_from_soft_dataset(soft_dataset)
+     return dataset_from_cross_table(crosstab)
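A short sketch of the new `expand_soft_dataset` helper, reusing the `soft` dataset from the builder sketch above:

```python
# Sketch: expand soft evidence into weighted hard instances.
from ck.dataset.dataset_from_crosstable import expand_soft_dataset

hard = expand_soft_dataset(soft)
# Per the docstring: a record with weights (0.3, 0.7) over a binary rv is split
# into hard instances whose weights reflect those state weights. Because the
# expansion goes through a cross-table, duplicate instances are merged and
# zero-weight instances are dropped, so the instance count of `hard` need not
# match that of `soft`.
```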
@@ -9,7 +9,7 @@ def hard_dataset_from_csv(
      lines: Iterable[str],
      *,
      weights: Optional[int | str] = None,
-     sep: str = ',',
+     sep: Optional[str] = ',',
      comment: str = '#',
  ) -> HardDataset:
      """
@@ -30,6 +30,8 @@ def hard_dataset_from_csv(
      of random variables. There must be a column name in the header to match
      each name of the given random variables. Additional columns will be ignored.
 
+     Leading and trailing whitespace is ignored for each field, including header column names.
+
      As text file (and StringIO) objects are iterable over lines, here is how to read a csv file:
      ```
      with open(csv_filename, 'r') as file:
@@ -46,6 +48,8 @@ def hard_dataset_from_csv(
          weights: the column in the csv file holding instance weights. Can be either the
              column number (counting from zero) or a column name (requires a header line).
          sep: the string to use to separate values in a line, default is a comma.
+             If set to `None`, lines will be split on any consecutive run of whitespace characters
+             (including \n \r \t \f and spaces).
          comment: text starting with this will be treated as a comment. Set to '' to disallow comments.
 
      Returns:
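The widened `sep` parameter enables whitespace-delimited input. A hedged sketch follows; the full positional signature of `hard_dataset_from_csv` is not visible in this hunk, so passing the random variables first (here, `rain` and `sprinkler` from the earlier sketch, named to match the header) is an assumption based on the docstring.

```python
from io import StringIO
from ck.dataset.dataset_from_csv import hard_dataset_from_csv

# Columns separated by mixed tabs and spaces; fields hold state indexes.
text = StringIO(
    'rain\tsprinkler   weight\n'
    '0   1\t2.5\n'
    '1\t0   1.0\n'
)
dataset = hard_dataset_from_csv(
    (rain, sprinkler),   # assumed first parameter: the random variables
    text,
    weights='weight',
    sep=None,            # split on any run of whitespace, not a single comma
)
```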