ultrasav 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ultrasav/__init__.py +280 -0
- ultrasav/_add_cases.py +227 -0
- ultrasav/_data.py +513 -0
- ultrasav/_make_dummy.py +137 -0
- ultrasav/_merge_data.py +435 -0
- ultrasav/_merge_meta.py +280 -0
- ultrasav/_metadata.py +570 -0
- ultrasav/_read_files.py +558 -0
- ultrasav/_write_files.py +111 -0
- ultrasav/metaman/__init__.py +91 -0
- ultrasav/metaman/def_detect_variable_type.py +454 -0
- ultrasav/metaman/def_get_meta.py +561 -0
- ultrasav/metaman/def_make_datamap.py +127 -0
- ultrasav/metaman/def_make_labels.py +833 -0
- ultrasav/metaman/def_map_engine.py +529 -0
- ultrasav/metaman/def_map_to_excel.py +294 -0
- ultrasav/metaman/def_write_excel_engine.py +298 -0
- ultrasav/metaman/pastel_color_schemes.py +185 -0
- ultrasav-0.1.4.dist-info/METADATA +550 -0
- ultrasav-0.1.4.dist-info/RECORD +21 -0
- ultrasav-0.1.4.dist-info/WHEEL +4 -0
ultrasav/__init__.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ultrasav - Ultra-powerful Python package for SPSS/SAV file processing.
|
|
3
|
+
|
|
4
|
+
ultrasav separates data and metadata operations into independent tracks that only
|
|
5
|
+
converge at read/write time. This design provides explicit control, clean separation
|
|
6
|
+
of concerns, and flexibility when working with SPSS data files.
|
|
7
|
+
|
|
8
|
+
⚡ Specium Ray for your data!
|
|
9
|
+
|
|
10
|
+
Basic usage:
|
|
11
|
+
>>> import ultrasav as ul
|
|
12
|
+
>>> # Read SPSS file - splits into two tracks
|
|
13
|
+
>>> data, meta = ul.read_sav("survey.sav")
|
|
14
|
+
>>>
|
|
15
|
+
>>> # Track 1: Process data independently
|
|
16
|
+
>>> data = ul.Data(data)
|
|
17
|
+
>>> data = data.move(first=["ID"]).rename({"Q1": "Question1"})
|
|
18
|
+
>>>
|
|
19
|
+
>>> # Track 2: Process metadata independently
|
|
20
|
+
>>> meta.column_labels = {"Question1": "Customer Satisfaction"}
|
|
21
|
+
>>>
|
|
22
|
+
>>> # Convergence: Write both tracks to SPSS
|
|
23
|
+
>>> ul.write_sav(data.to_native(), meta, "output.sav")
|
|
24
|
+
|
|
25
|
+
Main components:
|
|
26
|
+
- Data: Handle dataframe operations (move, rename, replace, drop, select)
|
|
27
|
+
- Metadata: Handle SPSS metadata (labels, formats, measures, missing values)
|
|
28
|
+
- read_sav/write_sav: File I/O for SPSS format
|
|
29
|
+
- add_cases: High-level function for merging files with metadata
|
|
30
|
+
- merge_data/merge_meta: Lower-level merge operations
|
|
31
|
+
|
|
32
|
+
Metadata tools (via metaman submodule):
|
|
33
|
+
- get_meta: Extract metadata to Python files
|
|
34
|
+
- make_labels: Create label dictionaries from Excel
|
|
35
|
+
- make_datamap: Build validation datamaps
|
|
36
|
+
- map_to_excel: Export formatted Excel reports
|
|
37
|
+
- detect_variable_type: Detect variable types (single/multi-select, etc.)
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
# __version__ = "0.1.0"
# Version is sourced from the installed distribution metadata so it never
# drifts from pyproject.toml; requires the package to be installed.
from importlib.metadata import version
__version__ = version("ultrasav")
__author__ = "Albert Li"
|
|
44
|
+
|
|
45
|
+
# =============================================================================
|
|
46
|
+
# Core classes
|
|
47
|
+
# =============================================================================
|
|
48
|
+
from ._data import Data
|
|
49
|
+
from ._metadata import Metadata
|
|
50
|
+
|
|
51
|
+
# =============================================================================
|
|
52
|
+
# I/O functions
|
|
53
|
+
# =============================================================================
|
|
54
|
+
from ._read_files import read_sav, read_csv, read_excel
|
|
55
|
+
from ._write_files import write_sav
|
|
56
|
+
|
|
57
|
+
# =============================================================================
|
|
58
|
+
# Merge functions
|
|
59
|
+
# =============================================================================
|
|
60
|
+
from ._merge_data import merge_data
|
|
61
|
+
from ._merge_meta import merge_meta, get_meta_summary
|
|
62
|
+
|
|
63
|
+
# =============================================================================
|
|
64
|
+
# High-level functions
|
|
65
|
+
# =============================================================================
|
|
66
|
+
from ._add_cases import add_cases
|
|
67
|
+
from ._make_dummy import make_dummies
|
|
68
|
+
|
|
69
|
+
# =============================================================================
|
|
70
|
+
# Metaman submodule - The Metadata Superhero 🦸
|
|
71
|
+
# =============================================================================
|
|
72
|
+
# 1.Submodule namespace access
|
|
73
|
+
from . import metaman
|
|
74
|
+
# ultrasav.metaman.get_meta(...)
|
|
75
|
+
# from ultrasav import metaman
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# 2.Top-level function access
|
|
79
|
+
# Re-export metaman's public API at top level for convenience
|
|
80
|
+
from .metaman import (
|
|
81
|
+
# Metadata extraction
|
|
82
|
+
get_meta,
|
|
83
|
+
|
|
84
|
+
# Label creation
|
|
85
|
+
make_labels,
|
|
86
|
+
|
|
87
|
+
# Variable type detection
|
|
88
|
+
detect_variable_type,
|
|
89
|
+
# create_mr_set_lookup,
|
|
90
|
+
|
|
91
|
+
# Datamap creation
|
|
92
|
+
make_datamap,
|
|
93
|
+
# map_engine,
|
|
94
|
+
|
|
95
|
+
# Excel export
|
|
96
|
+
map_to_excel,
|
|
97
|
+
# write_excel_engine,
|
|
98
|
+
|
|
99
|
+
# Color schemes
|
|
100
|
+
get_color_scheme,
|
|
101
|
+
COLOR_SCHEMES,
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
# =============================================================================
# Public API
# =============================================================================
# Names exported by `from ultrasav import *`. Keep this list in sync with the
# imports above; extended further below for the utility helpers.
__all__ = [
    # Version
    "__version__",

    # Core classes
    "Data",
    "Metadata",

    # Read functions
    "read_sav",
    "read_csv",
    "read_excel",

    # Write function
    "write_sav",

    # Merge functions
    "merge_data",
    "merge_meta",
    "get_meta_summary",

    # High-level functions
    "add_cases",
    "make_dummies",

    # Metaman submodule
    "metaman",

    # Metaman re-exports (top-level access)
    "get_meta",
    "make_labels",
    "detect_variable_type",
    # "create_mr_set_lookup",
    "make_datamap",
    # "map_engine",
    "map_to_excel",
    # "write_excel_engine",
    "get_color_scheme",
    "COLOR_SCHEMES",
]
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# =============================================================================
|
|
150
|
+
# Utility functions
|
|
151
|
+
# =============================================================================
|
|
152
|
+
|
|
153
|
+
def _show_architecture():
    """Display the ultrasav two-track architecture diagram.

    Internal helper; exposed to users through the public alias ``show_arch``.
    Prints a static diagram to stdout and returns None.
    """
    architecture = """
    ⚡ ULTRASAV ARCHITECTURE ⚡
    ==========================

    The Two-Track System:

    ┌─────────────┐          ┌─────────────┐
    │    DATA     │          │  METADATA   │
    │             │          │             │
    │  DataFrame  │          │   Labels    │
    │   Columns   │          │   Formats   │
    │   Values    │          │  Measures   │
    └─────────────┘          └─────────────┘
           │                        │
           │    Independent Work    │
           │                        │
           └────────┬────────────────┘
                    │
                    ▼
            ┌─────────────┐
            │  WRITE SAV  │
            └─────────────┘

    Key Principle:
    --------------
    Data and Metadata are two independent layers that only come together at read/write time.
    Like Ultraman's form changes, data can transform independently of its metadata!

    Workflow:
    ---------
    1. READING (Splitting):      read_sav() → (data, metadata)
    2. PROCESSING (Parallel):
       - Data operations via Data class
       - Metadata operations via Metadata class
    3. WRITING (Reunification):  write_sav(data, metadata)

    Benefits:
    ---------
    • Explicit Control - No hidden magic or automatic transfers
    • Clean Separation - Each class has single responsibility
    • Flexibility - Mix and match data/metadata from different sources
    • Easy to Extend - Clear boundaries for new features

    Metaman Submodule:
    ------------------
    🦸 The metadata superhero for inspection & reporting:
    • get_meta() - Extract metadata to Python files
    • make_labels() - Create labels from Excel templates
    • make_datamap() - Build validation datamaps
    • map_to_excel() - Export formatted Excel reports
    """
    print(architecture)
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def show_arch():
    """Display the ultrasav architecture diagram.

    Public alias for the internal ``_show_architecture`` helper (there is no
    public ``show_architecture``); prints the two-track diagram and returns None.
    """
    return _show_architecture()
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _quick_merge(files, output_file=None, source_col="mrgsrc", **kwargs):
    """
    Merge several files in a single call, optionally writing the result to disk.

    Thin convenience wrapper around ``add_cases`` (plus ``write_sav`` when a
    target file is given). Ultra-fast like Ultraman's flight speed!

    Parameters
    ----------
    files : list
        List of file paths to merge.
    output_file : str, optional
        If provided, the merged result is written to this file and nothing
        is returned.
    source_col : str, default "mrgsrc"
        Name of column tracking data sources.
    **kwargs
        Additional arguments passed through to ``add_cases``.

    Returns
    -------
    DataFrame, Metadata or None
        Merged data and metadata if output_file not specified.

    Examples
    --------
    >>> import ultrasav as ul
    >>> # Quick merge and save
    >>> ul._quick_merge(["file1.sav", "file2.sav"], "merged.sav")

    >>> # Quick merge and return
    >>> data, meta = ul._quick_merge(["file1.sav", "file2.sav"])
    """
    merged_data, merged_meta = add_cases(files, source_col=source_col, **kwargs)

    # No target file → hand both tracks back to the caller.
    if not output_file:
        return merged_data, merged_meta

    write_sav(merged_data, merged_meta, output_file)
    print(f"⚡ Specium Ray successful! Merged {len(files)} files → {output_file}")
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# Add utility functions to __all__
# Only the public alias is exported; _show_architecture and _quick_merge
# remain private helpers.
__all__.extend(["show_arch"])
# __all__.extend(["_show_architecture", "show_arch", "_quick_merge"])
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
# =============================================================================
|
|
259
|
+
# Package metadata
|
|
260
|
+
# =============================================================================
|
|
261
|
+
|
|
262
|
+
def about():
    """Print package information (version, author, main components) to stdout.

    Returns None; output goes to stdout only.
    """
    info = f"""
    ⚡ ultrasav v{__version__} ⚡
    ========================
    Ultra-powerful Python package for SPSS/SAV file processing.

    Author: {__author__}

    "Specium Ray for your data!"

    Core components: Data, Metadata, read_sav, write_sav
    Metaman tools: get_meta, make_labels, make_datamap, map_to_excel

    Use ul.show_arch() to see the two-track design.
    """
    # Fixed: the message previously pointed users at ul.show_architecture(),
    # which does not exist — the public helper is show_arch().
    print(info)
|
|
279
|
+
|
|
280
|
+
# Register about() in the public API as well.
__all__.append("about")
|
ultrasav/_add_cases.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""
|
|
2
|
+
add_cases.py
|
|
3
|
+
Top-level function for merging SAV files or dataframes with metadata
|
|
4
|
+
Following tidyspss 2.0's two-track architecture
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
from pathlib import Path
from typing import Any

from narwhals.typing import IntoFrame  # noqa: F401 — referenced by the commented-out type hints below

# Fixed: these previously imported from non-existent modules
# (def_merge_data, def_merge_meta, def_read_files, class_metadata);
# the package ships _merge_data.py, _merge_meta.py, _read_files.py and
# _metadata.py, and __init__.py imports them under those names.
from ._merge_data import merge_data
from ._merge_meta import merge_meta
from ._read_files import read_sav, read_csv, read_excel  # noqa: F401
from ._metadata import Metadata


# Module-level logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def add_cases(
    # inputs: list[str | Path | IntoFrame | tuple[IntoFrame, Any]],
    inputs: list[str | Path | Any],
    meta: list[Any | None] | None = None,
    output_format: str = "polars",
    source_col: str = "mrgsrc",
    meta_strategy: str = "first"
) -> tuple[Any, Any | None]:
    """
    Merge multiple SAV/CSV/Excel files or dataframes with their metadata.

    This is the main entry point for merging that combines both data and metadata
    merging following the two-track architecture. Data and metadata are
    merged independently and returned as a tuple.

    Parameters
    ----------
    inputs : list[str | Path | DataFrame | tuple[DataFrame, Metadata]]
        List of inputs to merge. Each element can be:
        - File path (str or Path) to:
            * SAV/ZSAV files (metadata extracted automatically)
            * CSV files (no metadata)
            * Excel files (.xlsx, .xls, .xlsm, .xlsb, .ods) (no metadata)
        - A dataframe (pandas, polars, or any narwhals-supported format) without metadata
        - A combination of file paths (str/Path) and dataframes (pandas/polars/narwhals)
        - A tuple of (dataframe, metadata) for explicit data-metadata pairs
    meta : list[Metadata | None] | None, optional
        Optional list of metadata objects to use for merging.
        - If None (default): metadata is automatically extracted from SAV files
        - If provided: uses these metadata objects for merging, ignoring any
          metadata from SAV files. The list does NOT need to match input length.
          Common usage: provide 1-2 metadata objects to merge, regardless of number of inputs
    output_format : str, default "polars"
        Output dataframe format: "pandas", "polars", or "narwhals"
    source_col : str, default "mrgsrc"
        Name of the provenance column to add to track data sources.
        This column will contain:
        - For file paths: the base filename (e.g., "survey_2024.sav", "data.csv", "report.xlsx")
        - For dataframes: "source_1", "source_2", etc.
    meta_strategy : str, default "first"
        Strategy for merging metadata:
        - "first": Use first non-None meta as base, add new columns from others
        - "last": Use last non-None meta as base, add new columns from others

    Returns
    -------
    tuple[DataFrame, Metadata | None]
        - Merged dataframe in the specified format with provenance column
        - Merged metadata (Metadata object) or None if no metadata available

    Raises
    ------
    ValueError
        If ``inputs`` is empty.
    FileNotFoundError
        If a path input does not exist on disk.

    Notes
    -----
    - Data and metadata are merged independently (two-track architecture)
    - If meta is None: uses metadata from SAV files (if any)
    - If meta is provided: uses ONLY those metadata objects, ignoring SAV metadata
    - The source column appears as the last column in the merged dataframe
    - Metadata merge follows column-level preservation (base wins for existing columns)
    - CSV and Excel files don't have metadata, but can still be merged for data

    File Format Support
    -------------------
    - SAV/ZSAV: Full support with automatic metadata extraction
    - CSV: Data only, no metadata
    - Excel: Data only (reads first sheet), no metadata
        * Supported extensions: .xlsx, .xls, .xlsm, .xlsb, .ods

    Examples
    --------
    >>> # Merge SAV files with automatic metadata extraction
    >>> data, meta = add_cases(["survey1.sav", "survey2.sav", "survey3.sav"])

    >>> # Mix different file types (SAV with metadata, CSV/Excel without)
    >>> data, meta = add_cases(["survey.sav", "additional_data.csv", "report.xlsx"])

    >>> # Provide specific metadata objects (ignores SAV metadata)
    >>> data, meta = add_cases(
    ...     inputs=["data1.sav", "data2.csv", "data3.xlsx"],
    ...     meta=[meta1, meta2]  # Only these two will be merged
    ... )

    >>> # Mix different input types
    >>> df1 = pd.DataFrame({'Q1': [1, 2]})
    >>> data, meta = add_cases([df1, "survey.sav", "data.csv", (df2, meta2)])

    >>> # Single metadata for multiple files of any type
    >>> data, meta = add_cases(
    ...     inputs=["file1.sav", "file2.csv", "file3.xlsx"],  # Mixed file types
    ...     meta=[base_meta],  # Just one metadata to use
    ...     meta_strategy="first"
    ... )

    >>> # Write merged result
    >>> from ultrasav import write_sav
    >>> write_sav(data, meta, "merged_output.sav")
    """

    if not inputs:
        raise ValueError("inputs list cannot be empty")

    # Separate data and metadata handling (two-track): dfs collects data
    # sources for merge_data, metas_to_merge collects metadata for merge_meta.
    dfs = []
    metas_to_merge = []

    # If meta parameter is provided, use ONLY those metadata objects;
    # any metadata carried by SAV files or (df, meta) tuples is ignored below.
    if meta is not None:
        # Wrap raw metadata in Metadata, but pass through None entries and
        # objects that are already Metadata instances.
        metas_to_merge = [Metadata(m) if m is not None and not isinstance(m, Metadata) else m
                          for m in meta]
        logger.info(f"Using {len(meta)} provided metadata objects (ignoring any SAV metadata)")

    # Process inputs for data extraction
    for i, item in enumerate(inputs):
        if isinstance(item, tuple) and len(item) == 2:
            # It's a (dataframe, metadata) tuple
            df, tuple_meta = item
            dfs.append(df)

            # Only use tuple metadata if meta parameter wasn't provided
            if meta is None and tuple_meta is not None:
                metas_to_merge.append(Metadata(tuple_meta) if not isinstance(tuple_meta, Metadata) else tuple_meta)
                logger.debug(f"Using tuple metadata for input {i}")

        elif isinstance(item, (str, Path)):
            # It's a file path
            file_path = Path(item)

            if not file_path.exists():
                raise FileNotFoundError(f"File not found: {file_path}")

            ext = file_path.suffix.lower()

            # Always pass file path to merge_data to preserve filename in source_col
            dfs.append(str(file_path))

            if ext in ['.sav', '.zsav']:
                # SAV files - extract metadata separately if needed.
                # Only use SAV metadata if meta parameter wasn't provided.
                if meta is None:
                    # NOTE(review): this reads the full data just to get the
                    # metadata; a metadata-only read would be cheaper if
                    # read_sav supports one — confirm.
                    _, meta_raw = read_sav(file_path, output_format="polars")
                    if meta_raw is not None:
                        metas_to_merge.append(Metadata(meta_raw))
                        logger.debug(f"Using SAV file metadata: {file_path.name}")
            elif ext == '.csv':
                # CSV files - no metadata available
                logger.debug(f"Added CSV file: {file_path.name} (no metadata)")
            elif ext in ['.xlsx', '.xls', '.xlsm', '.xlsb', '.ods']:
                # Excel files - no metadata available
                logger.debug(f"Added Excel file: {file_path.name} (no metadata)")
            else:
                # Other file types - log warning but try to process
                logger.warning(f"Unknown file type: {ext} - will attempt to process: {file_path.name}")

        else:
            # It's a dataframe without metadata
            dfs.append(item)
            logger.debug(f"Added dataframe {i}")

    # Log summary of inputs
    logger.info(f"Processing {len(inputs)} inputs for data")
    if meta is None:
        logger.info(f"Found {len(metas_to_merge)} metadata objects from SAV files/tuples")
    else:
        logger.info(f"Using {len(metas_to_merge)} provided metadata objects")

    # Count file types for logging
    file_type_counts = {}
    for item in inputs:
        if isinstance(item, (str, Path)):
            ext = Path(item).suffix.lower()
            file_type_counts[ext] = file_type_counts.get(ext, 0) + 1

    if file_type_counts:
        types_summary = ", ".join([f"{count} {ext}" for ext, count in file_type_counts.items()])
        logger.info(f"File types: {types_summary}")

    # Merge data using merge_data function (track 1)
    logger.info("Merging data...")
    merged_data = merge_data(dfs, source_col=source_col, output_format=output_format)

    # Merge metadata if any exists (track 2)
    merged_meta = None
    if metas_to_merge and any(m is not None for m in metas_to_merge):
        logger.info(f"Merging metadata with strategy='{meta_strategy}'...")
        merged_meta = merge_meta(metas_to_merge, strategy=meta_strategy)

        # Add label for the source column if not present
        if merged_meta and source_col not in merged_meta.column_labels:
            logger.debug(f"Adding label for source column '{source_col}'")
            labels_update = {source_col: "Data Source"}
            # Get existing labels and add new one
            existing_labels = merged_meta.column_labels if merged_meta.column_labels else {}
            merged_meta.column_labels = {**existing_labels, **labels_update}

            # Set as nominal measure if not present.
            # NOTE(review): this runs only when the label was also missing, so a
            # source column with a label but no measure is left untouched —
            # confirm that is intended.
            if source_col not in merged_meta.variable_measure:
                measures_update = {source_col: "nominal"}
                existing_measures = merged_meta.variable_measure if merged_meta.variable_measure else {}
                merged_meta.variable_measure = {**existing_measures, **measures_update}
    else:
        logger.info("No metadata to merge (common when merging CSV/Excel files)")

    # Log final summary
    data_shape = merged_data.shape if hasattr(merged_data, 'shape') else "unknown"
    meta_cols = len(merged_meta.column_labels) if merged_meta and merged_meta.column_labels else 0
    logger.info(f"Merge complete: data shape {data_shape}, metadata for {meta_cols} columns")

    return merged_data, merged_meta
|