f2a-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
f2a/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """f2a — File to Analysis.
+
+ A library that automatically performs descriptive statistical analysis
+ and visualization from various data sources.
+
+ Usage:
+     >>> import f2a
+     >>> report = f2a.analyze("data.csv")
+     >>> report.show()
+ """
+
+ from f2a._version import __version__
+ from f2a.core.analyzer import analyze
+
+ __all__ = ["__version__", "analyze"]
f2a/_version.py ADDED
@@ -0,0 +1,8 @@
+ """Version information for f2a."""
+
+ try:
+     from importlib.metadata import version as _get_version
+
+     __version__: str = _get_version("f2a")
+ except Exception:
+     __version__ = "0.1.0"
f2a/core/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """Core module — data loading, analysis orchestration, and schema inference."""
+
+ from f2a.core.loader import DataLoader
+ from f2a.core.analyzer import analyze, Analyzer
+ from f2a.core.schema import DataSchema, infer_schema
+
+ __all__ = ["DataLoader", "analyze", "Analyzer", "DataSchema", "infer_schema"]
f2a/core/analyzer.py ADDED
@@ -0,0 +1,454 @@
+ """Analysis orchestrator — coordinates the entire analysis pipeline."""
+
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any
+
+ import matplotlib.pyplot as plt
+ import pandas as pd
+
+ from f2a.core.loader import DataLoader
+ from f2a.core.schema import DataSchema, infer_schema
+ from f2a.stats.descriptive import DescriptiveStats
+ from f2a.stats.distribution import DistributionStats
+ from f2a.stats.correlation import CorrelationStats
+ from f2a.stats.missing import MissingStats
+ from f2a.viz.plots import BasicPlotter
+ from f2a.viz.corr_plots import CorrelationPlotter
+ from f2a.viz.missing_plots import MissingPlotter
+ from f2a.report.generator import ReportGenerator
+ from f2a.utils.logging import get_logger
+ from f2a.utils.validators import validate_source
+
+ logger = get_logger(__name__)
+
+
+ @dataclass
+ class StatsResult:
+     """Container for statistical analysis results."""
+
+     summary: pd.DataFrame
+     numeric_summary: pd.DataFrame
+     categorical_summary: pd.DataFrame
+     correlation_matrix: pd.DataFrame
+     missing_info: pd.DataFrame
+     distribution_info: pd.DataFrame
+
+     def get_numeric_summary(self) -> pd.DataFrame:
+         """Return numeric column summary."""
+         return self.numeric_summary
+
+     def get_categorical_summary(self) -> pd.DataFrame:
+         """Return categorical column summary."""
+         return self.categorical_summary
+
+
+ @dataclass
+ class VizResult:
+     """Container for visualization results."""
+
+     _df: pd.DataFrame
+     _schema: DataSchema
+     _figures: dict[str, plt.Figure] = field(default_factory=dict)
+
+     def plot_distributions(self) -> plt.Figure:
+         """Return distribution histograms for numeric columns."""
+         plotter = BasicPlotter(self._df, self._schema)
+         fig = plotter.histograms()
+         self._figures["distributions"] = fig
+         return fig
+
+     def plot_boxplots(self) -> plt.Figure:
+         """Return boxplots for numeric columns."""
+         plotter = BasicPlotter(self._df, self._schema)
+         fig = plotter.boxplots()
+         self._figures["boxplots"] = fig
+         return fig
+
+     def plot_correlation(self, method: str = "pearson") -> plt.Figure:
+         """Return correlation heatmap."""
+         plotter = CorrelationPlotter(self._df, self._schema)
+         fig = plotter.heatmap(method=method)
+         self._figures["correlation"] = fig
+         return fig
+
+     def plot_missing(self) -> plt.Figure:
+         """Return missing data bar chart."""
+         plotter = MissingPlotter(self._df, self._schema)
+         fig = plotter.bar()
+         self._figures["missing"] = fig
+         return fig
+
+
+ @dataclass
+ class SubsetReport:
+     """Analysis results for a single subset/split partition."""
+
+     subset: str
+     split: str
+     shape: tuple[int, int]
+     schema: DataSchema
+     stats: StatsResult
+     viz: VizResult
+     warnings: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class AnalysisReport:
+     """Top-level container for analysis results.
+
+     Attributes:
+         dataset_name: Dataset name.
+         shape: ``(rows, columns)`` tuple (total across all subsets).
+         schema: Data schema (of the first / single partition).
+         stats: Statistical analysis results (of the first / single partition).
+         viz: Visualization access object (of the first / single partition).
+         warnings: List of warnings found during analysis.
+         subsets: Per-subset/split reports (empty when only one partition).
+     """
+
+     dataset_name: str
+     shape: tuple[int, int]
+     schema: DataSchema
+     stats: StatsResult
+     viz: VizResult
+     warnings: list[str] = field(default_factory=list)
+     subsets: list[SubsetReport] = field(default_factory=list)
+
+     def show(self) -> None:
+         """Print analysis summary to console."""
+         sep = "=" * 60
+         print(sep)
+         print(f" f2a Analysis Report: {self.dataset_name}")
+         print(sep)
+
+         if self.subsets:
+             # Multi-subset mode
+             print(f"\n Total Rows: {self.shape[0]:,} | Subsets: {len(self.subsets)}")
+             for sr in self.subsets:
+                 print(f"\n{'-' * 60}")
+                 print(f" [{sr.subset} / {sr.split}] {sr.shape[0]:,} rows x {sr.shape[1]} cols")
+                 print(f" Memory: {sr.schema.memory_usage_mb} MB")
+                 print(f" Numeric: {len(sr.schema.numeric_columns)} | "
+                       f"Categorical: {len(sr.schema.categorical_columns)} | "
+                       f"Text: {len(sr.schema.text_columns)} | "
+                       f"Datetime: {len(sr.schema.datetime_columns)}")
+                 print()
+                 print(sr.stats.summary.to_string())
+                 if sr.warnings:
+                     print("\n Warnings:")
+                     for w in sr.warnings:
+                         print(f" - {w}")
+         else:
+             # Single-partition mode
+             print(f"\n Rows: {self.shape[0]:,} | Columns: {self.shape[1]}")
+             print(f" Memory: {self.schema.memory_usage_mb} MB")
+             print(f"\n Numeric: {len(self.schema.numeric_columns)}")
+             print(f" Categorical: {len(self.schema.categorical_columns)}")
+             print(f" Text: {len(self.schema.text_columns)}")
+             print(f" Datetime: {len(self.schema.datetime_columns)}")
+
+             print(f"\n{'-' * 60}")
+             print(" Summary Statistics:")
+             print(self.stats.summary.to_string())
+
+         if self.warnings:
+             print(f"\n{'-' * 60}")
+             print(" Warnings:")
+             for w in self.warnings:
+                 print(f" - {w}")
+
+         print(sep)
+
+     def to_html(self, output_dir: str = ".") -> Path:
+         """Generate and save an HTML report.
+
+         Args:
+             output_dir: Output directory path.
+
+         Returns:
+             Path to the saved HTML file.
+         """
+         generator = ReportGenerator()
+         safe_name = re.sub(r'[<>:"/\\|?*]', "_", self.dataset_name)
+         safe_name = safe_name.strip(". ")[:120] or "report"
+         output_path = Path(output_dir) / f"{safe_name}_report.html"
+
+         if self.subsets:
+             # Multi-subset mode: build per-subset section dicts
+             subset_sections: list[dict[str, Any]] = []
+             for sr in self.subsets:
+                 figures: dict[str, plt.Figure] = {}
+                 try:
+                     figures["Distribution Histograms"] = sr.viz.plot_distributions()
+                 except Exception:
+                     pass
+                 try:
+                     figures["Boxplots"] = sr.viz.plot_boxplots()
+                 except Exception:
+                     pass
+                 try:
+                     figures["Correlation Heatmap"] = sr.viz.plot_correlation()
+                 except Exception:
+                     pass
+                 try:
+                     figures["Missing Data"] = sr.viz.plot_missing()
+                 except Exception:
+                     pass
+                 subset_sections.append({
+                     "subset": sr.subset,
+                     "split": sr.split,
+                     "schema_summary": sr.schema.summary_dict(),
+                     "stats_df": sr.stats.summary,
+                     "figures": figures,
+                     "warnings": sr.warnings,
+                 })
+             generator.save_html_multi(
+                 output_path=output_path,
+                 dataset_name=self.dataset_name,
+                 sections=subset_sections,
+             )
+         else:
+             # Single-partition mode
+             figures: dict[str, plt.Figure] = {}
+             try:
+                 figures["Distribution Histograms"] = self.viz.plot_distributions()
+             except Exception:
+                 pass
+             try:
+                 figures["Boxplots"] = self.viz.plot_boxplots()
+             except Exception:
+                 pass
+             try:
+                 figures["Correlation Heatmap"] = self.viz.plot_correlation()
+             except Exception:
+                 pass
+             try:
+                 figures["Missing Data"] = self.viz.plot_missing()
+             except Exception:
+                 pass
+             generator.save_html(
+                 output_path=output_path,
+                 dataset_name=self.dataset_name,
+                 schema_summary=self.schema.summary_dict(),
+                 stats_df=self.stats.summary,
+                 figures=figures,
+                 warnings=self.warnings,
+             )
+         return output_path
+
+     def to_dict(self) -> dict[str, Any]:
+         """Return analysis results as a dictionary."""
+         result: dict[str, Any] = {
+             "dataset_name": self.dataset_name,
+             "shape": self.shape,
+             "schema": self.schema.summary_dict(),
+             "stats_summary": self.stats.summary.to_dict(),
+             "correlation_matrix": self.stats.correlation_matrix.to_dict()
+             if not self.stats.correlation_matrix.empty
+             else {},
+             "warnings": self.warnings,
+         }
+         if self.subsets:
+             result["subsets"] = [
+                 {
+                     "subset": sr.subset,
+                     "split": sr.split,
+                     "shape": sr.shape,
+                     "schema": sr.schema.summary_dict(),
+                     "stats_summary": sr.stats.summary.to_dict(),
+                     "warnings": sr.warnings,
+                 }
+                 for sr in self.subsets
+             ]
+         return result
+
+
+ class Analyzer:
+     """Orchestrate the analysis pipeline.
+
+     Example:
+         >>> analyzer = Analyzer()
+         >>> report = analyzer.run("data.csv")
+         >>> report.show()
+     """
+
+     def __init__(self) -> None:
+         self._loader = DataLoader()
+
+     def run(self, source: str, **kwargs: Any) -> AnalysisReport:
+         """Execute the full analysis pipeline.
+
+         Args:
+             source: Data source (file path or HuggingFace address).
+             **kwargs: Additional arguments passed to the loader.
+
+         Returns:
+             :class:`AnalysisReport` instance.
+         """
+         source = validate_source(source)
+         logger.info("Analysis started: %s", source)
+
+         # 1. Load data
+         df = self._loader.load(source, **kwargs)
+
+         # 2. Check for multi-subset HuggingFace data
+         has_partitions = "__subset__" in df.columns and "__split__" in df.columns
+
+         if has_partitions:
+             return self._run_multi_subset(source, df)
+
+         # Single-partition analysis
+         return self._run_single(source, df)
+
+     def _run_single(
+         self, source: str, df: pd.DataFrame
+     ) -> AnalysisReport:
+         """Run analysis on a single DataFrame."""
+         schema = infer_schema(df)
+         logger.info("Schema inference complete: %s", schema.summary_dict())
+
+         warnings: list[str] = []
+         stats = self._compute_stats(df, schema, warnings)
+
+         dataset_name = (
+             Path(source).stem
+             if "/" not in source or "://" not in source
+             else source
+         )
+         viz = VizResult(_df=df, _schema=schema)
+
+         report = AnalysisReport(
+             dataset_name=dataset_name,
+             shape=(len(df), len(df.columns)),
+             schema=schema,
+             stats=stats,
+             viz=viz,
+             warnings=warnings,
+         )
+         logger.info("Analysis complete: %s", source)
+         return report
+
+     def _run_multi_subset(
+         self, source: str, df: pd.DataFrame
+     ) -> AnalysisReport:
+         """Run analysis on a multi-subset HuggingFace DataFrame."""
+         groups = df.groupby(["__subset__", "__split__"], sort=False)
+
+         subset_reports: list[SubsetReport] = []
+         all_warnings: list[str] = []
+
+         for (subset_name, split_name), group_df in groups:
+             # Drop the metadata columns before analysis
+             part_df = group_df.drop(columns=["__subset__", "__split__"]).reset_index(drop=True)
+
+             schema = infer_schema(part_df)
+             warnings: list[str] = []
+             stats = self._compute_stats(part_df, schema, warnings)
+             viz = VizResult(_df=part_df, _schema=schema)
+
+             sr = SubsetReport(
+                 subset=str(subset_name),
+                 split=str(split_name),
+                 shape=(len(part_df), len(part_df.columns)),
+                 schema=schema,
+                 stats=stats,
+                 viz=viz,
+                 warnings=warnings,
+             )
+             subset_reports.append(sr)
+             all_warnings.extend(
+                 f"[{subset_name}/{split_name}] {w}" for w in warnings
+             )
+             logger.info(
+                 "Subset analysis complete: %s/%s (%d rows × %d cols)",
+                 subset_name, split_name, len(part_df), len(part_df.columns),
+             )
+
+         # Use the first subset for top-level schema/stats/viz
+         first = subset_reports[0]
+         total_rows = sum(sr.shape[0] for sr in subset_reports)
+         total_cols = first.shape[1]
+
+         report = AnalysisReport(
+             dataset_name=source,
+             shape=(total_rows, total_cols),
+             schema=first.schema,
+             stats=first.stats,
+             viz=first.viz,
+             warnings=all_warnings,
+             subsets=subset_reports,
+         )
+         logger.info(
+             "Multi-subset analysis complete: %s (%d subsets, %d total rows)",
+             source, len(subset_reports), total_rows,
+         )
+         return report
+
+     def _compute_stats(
+         self,
+         df: pd.DataFrame,
+         schema: DataSchema,
+         warnings: list[str],
+     ) -> StatsResult:
+         """Perform all statistical analyses."""
+         desc = DescriptiveStats(df, schema)
+         dist = DistributionStats(df, schema)
+         corr = CorrelationStats(df, schema)
+         miss = MissingStats(df, schema)
+
+         # Descriptive statistics
+         summary = desc.summary()
+         numeric_summary = desc.numeric_summary()
+         categorical_summary = desc.categorical_summary()
+
+         # Correlation analysis
+         correlation_matrix = corr.pearson()
+         high_corrs = corr.high_correlations(threshold=0.9)
+         for col_a, col_b, val in high_corrs:
+             warnings.append(f"High correlation: {col_a} ↔ {col_b} (r={val})")
+
+         # Missing data
+         missing_info = miss.column_summary()
+         total_missing = miss.total_missing_ratio()
+         if total_missing > 0.1:
+             warnings.append(f"Overall missing ratio is high: {total_missing * 100:.1f}%")
+
+         # Distribution
+         distribution_info = dist.analyze()
+
+         return StatsResult(
+             summary=summary,
+             numeric_summary=numeric_summary,
+             categorical_summary=categorical_summary,
+             correlation_matrix=correlation_matrix,
+             missing_info=missing_info,
+             distribution_info=distribution_info,
+         )
+
+
+ def analyze(source: str, **kwargs: Any) -> AnalysisReport:
+     """Analyze a data source and return a report.
+
+     This function is the main entry point for ``f2a``.
+
+     Args:
+         source: File path or HuggingFace dataset address.
+             - File: ``"data.csv"``, ``"data.json"``, ``"data.parquet"``
+             - HuggingFace: ``"hf://imdb"``, ``"hf://squad"``
+         **kwargs: Additional arguments passed to the data loader.
+
+     Returns:
+         :class:`AnalysisReport` — statistics, visualization, and report access object.
+
+     Example:
+         >>> import f2a
+         >>> report = f2a.analyze("sales.csv")
+         >>> report.show()
+         >>> report.to_html("output/")
+     """
+     analyzer = Analyzer()
+     return analyzer.run(source, **kwargs)