edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edgepython/__init__.py ADDED
@@ -0,0 +1,114 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ edgePython: Python port of the edgeR Bioconductor package.
4
+
5
+ Empirical analysis of digital gene expression data in Python.
6
+ """
7
+
8
# Keep in sync with the released distribution version (this wheel is 0.2.0).
__version__ = "0.2.0"
9
+
10
+ # --- Classes ---
11
+ from .classes import DGEList, DGEExact, DGEGLM, DGELRT, TopTags
12
+ from .classes import cbind_dgelist, rbind_dgelist
13
+ from .compressed_matrix import CompressedMatrix
14
+
15
+ # --- DGEList construction & accessors ---
16
+ from .dgelist import (
17
+ make_dgelist,
18
+ valid_dgelist,
19
+ get_counts,
20
+ get_dispersion,
21
+ get_dispersion_type,
22
+ get_offset,
23
+ get_norm_lib_sizes,
24
+ )
25
+
26
+ # --- Normalization ---
27
+ from .normalization import calc_norm_factors, normalize_chip_to_input, calc_norm_offsets_for_chip
28
+
29
+ # --- Expression ---
30
+ from .expression import cpm, rpkm, tpm, ave_log_cpm, cpm_by_group, rpkm_by_group
31
+
32
+ # --- Filtering ---
33
+ from .filtering import filter_by_expr
34
+
35
+ # --- Dispersion estimation ---
36
+ from .dispersion import (
37
+ estimate_disp,
38
+ WLEB,
39
+ estimate_common_disp,
40
+ estimate_tagwise_disp,
41
+ estimate_trended_disp,
42
+ estimate_glm_common_disp,
43
+ estimate_glm_trended_disp,
44
+ estimate_glm_tagwise_disp,
45
+ )
46
+
47
+ # --- GLM fitting ---
48
+ from .glm_fit import glm_fit, glm_ql_fit, mglm_one_group, mglm_one_way
49
+
50
+ # --- GLM testing ---
51
+ from .glm_test import glm_lrt, glm_ql_ftest, glm_treat
52
+
53
+ # --- Exact test ---
54
+ from .exact_test import (
55
+ exact_test,
56
+ exact_test_double_tail,
57
+ equalize_lib_sizes,
58
+ q2q_nbinom,
59
+ split_into_groups,
60
+ )
61
+
62
+ # --- Results ---
63
+ from .results import top_tags, decide_tests
64
+
65
+ # --- I/O ---
66
+ from .io import (
67
+ read_data,
68
+ to_anndata,
69
+ read_dge,
70
+ read_10x,
71
+ catch_salmon,
72
+ catch_kallisto,
73
+ catch_rsem,
74
+ feature_counts_to_dgelist,
75
+ read_bismark2dge,
76
+ seurat_to_pb,
77
+ )
78
+
79
+ # --- Visualization ---
80
+ from .visualization import (
81
+ plot_md,
82
+ plot_bcv,
83
+ plot_mds,
84
+ plot_smear,
85
+ plot_ql_disp,
86
+ ma_plot,
87
+ gof,
88
+ )
89
+
90
+ # --- Splicing ---
91
+ from .splicing import diff_splice, diff_splice_dge, splice_variants
92
+
93
+ # --- Gene sets ---
94
+ from .gene_sets import camera, fry, roast, mroast, romer, goana, kegga
95
+
96
+ # --- Utilities ---
97
+ from .utils import (
98
+ model_matrix,
99
+ model_matrix_meth,
100
+ nearest_tss,
101
+ add_prior_count,
102
+ pred_fc,
103
+ good_turing,
104
+ thin_counts,
105
+ gini,
106
+ sum_tech_reps,
107
+ zscore_nbinom,
108
+ )
109
+
110
+ # --- Single-cell GLM ---
111
+ from .sc_fit import glm_sc_fit, glm_sc_test, shrink_sc_disp
112
+
113
+ # --- limma utilities ---
114
+ from .limma_port import squeeze_var
edgepython/classes.py ADDED
@@ -0,0 +1,517 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ Core data classes for edgePython.
4
+
5
+ Port of edgeR's S4 classes (DGEList, DGEExact, DGEGLM, DGELRT, TopTags)
6
+ as Python dataclasses with dict-like access, subsetting, and display.
7
+ """
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from copy import deepcopy
12
+
13
+
14
+ class _EdgeRBase(dict):
15
+ """Base class providing dict-like access, subsetting, and display."""
16
+
17
+ def __getattr__(self, name):
18
+ try:
19
+ return self[name]
20
+ except KeyError:
21
+ raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")
22
+
23
+ def __setattr__(self, name, value):
24
+ self[name] = value
25
+
26
+ def __delattr__(self, name):
27
+ try:
28
+ del self[name]
29
+ except KeyError:
30
+ raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")
31
+
32
+ @property
33
+ def shape(self):
34
+ if 'counts' in self:
35
+ return self['counts'].shape
36
+ return None
37
+
38
+ def __repr__(self):
39
+ cls = type(self).__name__
40
+ components = list(self.keys())
41
+ s = self.shape
42
+ if s is not None:
43
+ return f"{cls} with {s[0]} rows and {s[1]} columns\nComponents: {', '.join(components)}"
44
+ return f"{cls}\nComponents: {', '.join(components)}"
45
+
46
+ def _copy(self):
47
+ """Deep copy of the object."""
48
+ return deepcopy(self)
49
+
50
+ def head(self, n=5):
51
+ """Show first n rows."""
52
+ if 'table' in self:
53
+ return self['table'].head(n)
54
+ if 'counts' in self:
55
+ return pd.DataFrame(
56
+ self['counts'][:n],
57
+ index=_get_rownames(self)[:n] if _get_rownames(self) is not None else None,
58
+ columns=_get_colnames(self) if _get_colnames(self) is not None else None
59
+ )
60
+ return None
61
+
62
+ def tail(self, n=5):
63
+ """Show last n rows."""
64
+ if 'table' in self:
65
+ return self['table'].tail(n)
66
+ if 'counts' in self:
67
+ return pd.DataFrame(
68
+ self['counts'][-n:],
69
+ index=_get_rownames(self)[-n:] if _get_rownames(self) is not None else None,
70
+ columns=_get_colnames(self) if _get_colnames(self) is not None else None
71
+ )
72
+ return None
73
+
74
+
75
+ def _get_rownames(obj):
76
+ """Get row names from genes or counts."""
77
+ if 'genes' in obj and obj['genes'] is not None:
78
+ return list(obj['genes'].index)
79
+ if 'counts' in obj and obj['counts'] is not None:
80
+ c = obj['counts']
81
+ if hasattr(c, 'index'):
82
+ return list(c.index)
83
+ return None
84
+
85
+
86
+ def _get_colnames(obj):
87
+ """Get column names from samples or counts."""
88
+ if 'samples' in obj and obj['samples'] is not None:
89
+ return list(obj['samples'].index)
90
+ return None
91
+
92
+
93
+ def _subset_matrix_or_df(x, i=None, j=None):
94
+ """Subset a matrix, DataFrame, or vector by row (i) and/or column (j)."""
95
+ if x is None:
96
+ return None
97
+ if isinstance(x, pd.DataFrame):
98
+ if i is not None and j is not None:
99
+ return x.iloc[i, j]
100
+ elif i is not None:
101
+ return x.iloc[i]
102
+ elif j is not None:
103
+ return x.iloc[:, j]
104
+ return x
105
+ if isinstance(x, np.ndarray):
106
+ if x.ndim == 2:
107
+ if i is not None and j is not None:
108
+ return x[np.ix_(np.atleast_1d(i), np.atleast_1d(j))] if not isinstance(i, slice) else x[i, :][:, j]
109
+ elif i is not None:
110
+ return x[i] if isinstance(i, slice) else x[np.atleast_1d(i)]
111
+ elif j is not None:
112
+ return x[:, j] if isinstance(j, slice) else x[:, np.atleast_1d(j)]
113
+ elif x.ndim == 1:
114
+ if i is not None:
115
+ return x[i] if isinstance(i, slice) else x[np.atleast_1d(i)]
116
+ return x
117
+ return x
118
+
119
+
120
+ def _resolve_index(idx, names):
121
+ """Resolve index to integer array. Supports bool, int, str, slice."""
122
+ if idx is None:
123
+ return None
124
+ if isinstance(idx, slice):
125
+ return idx
126
+ idx = np.atleast_1d(idx)
127
+ if idx.dtype == bool:
128
+ return np.where(idx)[0]
129
+ if idx.dtype.kind in ('U', 'S', 'O') and names is not None:
130
+ names_arr = np.asarray(names)
131
+ result = []
132
+ for name in idx:
133
+ matches = np.where(names_arr == name)[0]
134
+ if len(matches) == 0:
135
+ raise KeyError(f"Name '{name}' not found")
136
+ result.append(matches[0])
137
+ return np.array(result)
138
+ return idx.astype(int)
139
+
140
+
141
class DGEList(_EdgeRBase):
    """Digital Gene Expression data list.

    Attributes
    ----------
    counts : ndarray
        Matrix of counts (genes x samples).
    samples : DataFrame
        Sample information with columns group, lib.size, norm.factors.
    genes : DataFrame or None
        Gene annotation.
    common.dispersion : float or None
    trended.dispersion : ndarray or None
    tagwise.dispersion : ndarray or None
    AveLogCPM : ndarray or None
    offset : ndarray or None
    weights : ndarray or None

    Notes
    -----
    ``obj['name']`` returns a component.  ``obj[i, j]`` returns a deep copy
    subsetted by rows ``i`` and columns ``j``; each selector may be None, a
    slice, a boolean mask, integer positions or names.
    """

    # Matrices subset by both rows (i) and columns (j).
    _IJ = {'counts', 'pseudo.counts', 'offset', 'weights'}
    # Row annotation: its rows follow i.
    _IX = {'genes'}
    # Sample table: its index supplies the column names, so its ROWS follow j.
    _JX = {'samples'}
    # Per-row vectors: follow i.
    _I = {'AveLogCPM', 'trended.dispersion', 'tagwise.dispersion', 'prior.n', 'prior.df'}

    @staticmethod
    def _drop_full_slice(idx):
        """Map a bare ``:`` selector to None (meaning: keep the whole axis)."""
        if isinstance(idx, slice) and idx == slice(None):
            return None
        return idx

    def __getitem__(self, key):
        if isinstance(key, str):
            return super().__getitem__(key)
        if not (isinstance(key, tuple) and len(key) == 2):
            raise IndexError("Two subscripts required")
        i, j = key

        # A full slice selects everything, so treat it like None.  This skips
        # needless work and keeps ``obj[rows, :]`` from pairing an array
        # selector with a slice in the matrix helper.
        i_idx = self._drop_full_slice(_resolve_index(i, _get_rownames(self)))
        j_idx = self._drop_full_slice(_resolve_index(j, _get_colnames(self)))

        out = self._copy()

        for k in self._IJ:
            if out.get(k) is not None:
                out[k] = _subset_matrix_or_df(out[k], i_idx, j_idx)
        for k in self._IX | self._I:
            if out.get(k) is not None:
                out[k] = _subset_matrix_or_df(out[k], i_idx)
        for k in self._JX:
            if out.get(k) is not None:
                # Pass j as the ROW selector: the sample table has one row
                # per column of counts.  Passing it as a column selector would
                # slice the metadata columns instead.
                out[k] = _subset_matrix_or_df(out[k], j_idx)

        # Drop empty group levels after column subsetting
        samples = out.get('samples')
        if j_idx is not None and samples is not None and 'group' in samples.columns:
            group = samples['group']
            if hasattr(group, 'cat'):
                # Assign on a fresh copy so pandas never sees a write to a
                # slice of another frame.
                samples = samples.copy()
                samples['group'] = group.cat.remove_unused_categories()
                out['samples'] = samples

        return out

    @property
    def nrow(self):
        """Number of rows of ``counts`` (0 when there is no counts key)."""
        if 'counts' in self:
            return self['counts'].shape[0]
        return 0

    @property
    def ncol(self):
        """Number of columns of ``counts`` (0 when there is no counts key)."""
        if 'counts' in self:
            return self['counts'].shape[1]
        return 0

    def __len__(self):
        # Length is the number of rows, not the number of stored components.
        return self.nrow

    def dim(self):
        """Return the shape of ``counts`` (same as the ``shape`` property)."""
        return self.shape

    def dimnames(self):
        """Return ``(row names, column names)``; either may be None."""
        return (_get_rownames(self), _get_colnames(self))

    def to_dataframe(self):
        """Convert counts to DataFrame."""
        return pd.DataFrame(
            self['counts'],
            index=_get_rownames(self),
            columns=_get_colnames(self)
        )
233
+
234
+
235
class DGEExact(_EdgeRBase):
    """Results of exact test for differential expression.

    Attributes
    ----------
    table : DataFrame
        With columns logFC, logCPM, PValue.
    comparison : list
        Two group names being compared.
    genes : DataFrame or None

    Notes
    -----
    ``obj[i, None]`` and ``obj[i, :]`` return a deep copy with ``table`` and
    ``genes`` subsetted by rows.  Any other column selector raises IndexError.
    """

    def __getitem__(self, key):
        if isinstance(key, str):
            return super().__getitem__(key)
        if not isinstance(key, tuple):
            raise IndexError("Two subscripts required (rows, columns)")
        if len(key) != 2:
            raise IndexError("Two subscripts required")
        i, j = key

        # A bare ``:`` selects every column, i.e. no column subsetting, so it
        # is accepted just like None.
        all_columns = j is None or (isinstance(j, slice) and j == slice(None))
        if not all_columns:
            raise IndexError("Subsetting columns not allowed for DGEExact objects.")

        out = self._copy()
        table = out.get('table')
        rownames = list(table.index) if table is not None else None
        i_idx = _resolve_index(i, rownames)

        for k in ('table', 'genes'):
            if out.get(k) is not None:
                out[k] = _subset_matrix_or_df(out[k], i_idx)
        return out

    def __repr__(self):
        out = ""
        if 'comparison' in self and self['comparison'] is not None:
            out += f"Comparison of groups: {self['comparison'][1]}-{self['comparison'][0]}\n"
        if 'table' in self and self['table'] is not None:
            out += str(self['table'])
        return out

    @property
    def shape(self):
        """Shape of ``table``, or None when there is no table."""
        table = self.get('table')
        if table is None:
            return None
        return table.shape
284
+
285
+
286
class DGEGLM(_EdgeRBase):
    """Fitted GLM object for DGE data.

    Attributes
    ----------
    coefficients : ndarray
        Matrix of coefficients.
    fitted.values : ndarray
    deviance : ndarray
    counts : ndarray
    offset : ndarray or CompressedMatrix
    weights : ndarray or None
    design : ndarray
    dispersion : float or ndarray
    df.residual : ndarray
    samples : DataFrame
    genes : DataFrame or None
    AveLogCPM : ndarray or None

    Notes
    -----
    ``obj[i, None]`` and ``obj[i, :]`` return a deep copy with every listed
    component subsetted by rows.  Any other column selector raises IndexError.
    """

    # Row-indexed matrices / tables.
    _IX = {'counts', 'offset', 'weights', 'genes', 'coefficients', 'fitted.values',
           'unshrunk.coefficients', 'leverage', 'unit.deviance.adj', 'unit.df.adj'}
    # Row-indexed vectors (non-array values are left untouched by the helper).
    _I = {'AveLogCPM', 'dispersion', 'prior.n', 'prior.df', 's2.post', 's2.prior',
          'df.prior', 'df.residual', 'df.residual.zeros', 'df.residual.adj',
          'deviance', 'deviance.adj', 'iter', 'failed'}

    def __getitem__(self, key):
        if isinstance(key, str):
            return super().__getitem__(key)
        if not (isinstance(key, tuple) and len(key) == 2):
            raise IndexError("Two subscripts required")
        i, j = key

        # A bare ``:`` selects every column, i.e. no column subsetting, so it
        # is accepted just like None.
        all_columns = j is None or (isinstance(j, slice) and j == slice(None))
        if not all_columns:
            raise IndexError("Subsetting columns not allowed for DGEGLM object.")

        out = self._copy()
        i_idx = _resolve_index(i, _get_rownames(self))

        # Both groups are subset the same way: by rows only.
        for k in self._IX | self._I:
            if out.get(k) is not None:
                out[k] = _subset_matrix_or_df(out[k], i_idx)
        return out

    @property
    def nrow(self):
        """Rows of ``coefficients`` if present, else of ``counts``, else 0."""
        if 'coefficients' in self:
            return self['coefficients'].shape[0]
        if 'counts' in self:
            return self['counts'].shape[0]
        return 0

    @property
    def ncol(self):
        """Number of columns of ``counts`` (0 when there is no counts key)."""
        if 'counts' in self:
            return self['counts'].shape[1]
        return 0
351
+
352
+
353
class DGELRT(_EdgeRBase):
    """Likelihood ratio test results for DGE GLMs.

    Attributes
    ----------
    table : DataFrame
        With columns logFC, logCPM, LR (or F), PValue.
    comparison : str
        Name of coefficient tested.
    coefficients, fitted.values, etc. : inherited from DGEGLM fit.

    Notes
    -----
    ``obj[i, None]`` and ``obj[i, :]`` return a deep copy with every listed
    component subsetted by rows.  Any other column selector raises IndexError.
    """

    # Row-indexed matrices / tables.
    _IX = {'counts', 'offset', 'weights', 'genes', 'coefficients', 'fitted.values',
           'table', 'unshrunk.coefficients', 'leverage', 'unit.deviance.adj', 'unit.df.adj'}
    # Row-indexed vectors (non-array values are left untouched by the helper).
    _I = {'AveLogCPM', 'dispersion', 'prior.n', 'prior.df', 's2.post', 's2.prior',
          'df.prior', 'df.residual', 'df.residual.zeros', 'df.residual.adj',
          'deviance', 'deviance.adj', 'iter', 'failed', 'df.test', 'df.total'}

    def __getitem__(self, key):
        if isinstance(key, str):
            return super().__getitem__(key)
        if not (isinstance(key, tuple) and len(key) == 2):
            raise IndexError("Two subscripts required")
        i, j = key

        # A bare ``:`` selects every column, i.e. no column subsetting, so it
        # is accepted just like None.
        all_columns = j is None or (isinstance(j, slice) and j == slice(None))
        if not all_columns:
            raise IndexError("Subsetting columns not allowed for DGELRT object.")

        out = self._copy()
        rownames = _get_rownames(self)
        # Fall back to the result table's index when genes/counts carry no names.
        if rownames is None and self.get('table') is not None:
            rownames = list(self['table'].index)
        i_idx = _resolve_index(i, rownames)

        # Both groups are subset the same way: by rows only.
        for k in self._IX | self._I:
            if out.get(k) is not None:
                out[k] = _subset_matrix_or_df(out[k], i_idx)
        return out

    def __repr__(self):
        out = ""
        if 'comparison' in self:
            out += f"Coefficient: {self['comparison']}\n"
        if 'table' in self:
            out += str(self['table'])
        return out

    @property
    def shape(self):
        """Shape of ``table`` if present, else of ``coefficients``, else None."""
        for k in ('table', 'coefficients'):
            value = self.get(k)
            if value is not None:
                return value.shape
        return None
414
+
415
+
416
class TopTags(_EdgeRBase):
    """Top differentially expressed genes.

    Attributes
    ----------
    table : DataFrame
        Sorted table of top genes.
    adjust_method : str
    comparison : str or list
    test : str
        Either 'exact' or 'glm'.

    Notes
    -----
    ``obj[i, j]`` returns a deep copy whose ``table`` is subsetted by rows
    ``i`` and columns ``j``.  Selectors may be None, slices, boolean masks,
    integer positions or row/column names; the table stays two-dimensional.
    """

    def __getitem__(self, key):
        if isinstance(key, str):
            return super().__getitem__(key)
        if not (isinstance(key, tuple) and len(key) == 2):
            raise IndexError("Two subscripts required")
        i, j = key

        out = self._copy()
        table = out.get('table')
        if table is not None and (i is not None or j is not None):
            # Resolve selectors the same way the sibling classes do, so names
            # and boolean masks work and a scalar selector does not collapse
            # the table into a Series.
            rownames = list(table.index) if hasattr(table, 'index') else None
            colnames = list(table.columns) if hasattr(table, 'columns') else None
            out['table'] = _subset_matrix_or_df(
                table,
                _resolve_index(i, rownames),
                _resolve_index(j, colnames),
            )
        return out

    def __repr__(self):
        out = ""
        if self.get('test') == 'exact':
            # ``comparison`` may be stored as None; treat that like "absent".
            comp = self.get('comparison')
            if comp is not None and len(comp) >= 2:
                out += f"Comparison of groups: {comp[1]}-{comp[0]}\n"
        else:
            out += f"Coefficient: {self.get('comparison', '')}\n"
        if 'table' in self:
            out += str(self['table'])
        return out

    @property
    def shape(self):
        """Shape of ``table``, or None when there is no table."""
        table = self.get('table')
        if table is None:
            return None
        return table.shape
463
+
464
+
465
def cbind_dgelist(*objects):
    """Column-bind (combine samples) DGEList objects.

    Port of edgeR's cbind.DGEList.

    Parameters
    ----------
    *objects : DGEList
        Objects to combine, in column order.

    Returns
    -------
    DGEList
        A single argument is returned as-is (not copied).  Otherwise a deep
        copy of the first object with ``counts`` stacked horizontally and
        ``samples`` concatenated.  ``offset``, ``weights`` and
        ``pseudo.counts`` are stacked when every object provides them and
        dropped otherwise.  Dispersions and ``AveLogCPM`` are removed.

    Raises
    ------
    ValueError
        If the first object has gene annotation that differs from a later one.
    """
    if len(objects) == 1:
        return objects[0]

    out = objects[0]._copy()
    for obj in objects[1:]:
        # Check gene compatibility
        genes = out.get('genes')
        if genes is not None and not genes.equals(obj.get('genes')):
            raise ValueError("DGEList objects have different genes")

        out['counts'] = np.hstack([out['counts'], obj['counts']])
        out['samples'] = pd.concat([out['samples'], obj['samples']], ignore_index=False)

        for key in ('offset', 'weights', 'pseudo.counts'):
            if out.get(key) is None:
                continue
            if obj.get(key) is None:
                # Keeping the old matrix would leave it with fewer columns
                # than the combined counts, so drop it instead.
                del out[key]
            else:
                out[key] = np.hstack([out[key], obj[key]])

    # Clear quantities estimated from the previous set of samples.
    for key in ('common.dispersion', 'trended.dispersion', 'tagwise.dispersion', 'AveLogCPM'):
        if key in out:
            del out[key]

    return out
493
+
494
+
495
def rbind_dgelist(*objects):
    """Row-bind (combine genes) DGEList objects.

    Port of edgeR's rbind.DGEList.

    Parameters
    ----------
    *objects : DGEList
        Objects to combine, in row order.

    Returns
    -------
    DGEList
        A single argument is returned as-is (not copied).  Otherwise a deep
        copy of the first object with ``counts`` stacked vertically.
        ``genes``, ``offset``, ``weights`` and ``pseudo.counts`` are
        concatenated when every object provides them and dropped otherwise.
        Dispersions and ``AveLogCPM`` are removed.
    """
    if len(objects) == 1:
        return objects[0]

    out = objects[0]._copy()
    for obj in objects[1:]:
        out['counts'] = np.vstack([out['counts'], obj['counts']])

        if out.get('genes') is not None:
            if obj.get('genes') is not None:
                out['genes'] = pd.concat([out['genes'], obj['genes']], ignore_index=False)
            else:
                # Keeping the old annotation would leave it with fewer rows
                # than the combined counts, so drop it instead.
                del out['genes']

        for key in ('offset', 'weights', 'pseudo.counts'):
            if out.get(key) is None:
                continue
            if obj.get(key) is None:
                # Same reasoning as for genes: avoid a row-count mismatch.
                del out[key]
            else:
                out[key] = np.vstack([out[key], obj[key]])

    # Clear quantities estimated from the previous set of rows.
    for key in ('common.dispersion', 'trended.dispersion', 'tagwise.dispersion', 'AveLogCPM'):
        if key in out:
            del out[key]

    return out