jsonunwrap 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {jsonunwrap-0.2.1 → jsonunwrap-0.2.3}/PKG-INFO +11 -4
- {jsonunwrap-0.2.1 → jsonunwrap-0.2.3}/README.md +10 -3
- jsonunwrap-0.2.3/jsonunwrap/core.py +99 -0
- {jsonunwrap-0.2.1 → jsonunwrap-0.2.3}/jsonunwrap.egg-info/PKG-INFO +11 -4
- {jsonunwrap-0.2.1 → jsonunwrap-0.2.3}/pyproject.toml +1 -1
- jsonunwrap-0.2.1/jsonunwrap/core.py +0 -80
- {jsonunwrap-0.2.1 → jsonunwrap-0.2.3}/jsonunwrap/__init__.py +0 -0
- {jsonunwrap-0.2.1 → jsonunwrap-0.2.3}/jsonunwrap.egg-info/SOURCES.txt +0 -0
- {jsonunwrap-0.2.1 → jsonunwrap-0.2.3}/jsonunwrap.egg-info/dependency_links.txt +0 -0
- {jsonunwrap-0.2.1 → jsonunwrap-0.2.3}/jsonunwrap.egg-info/requires.txt +0 -0
- {jsonunwrap-0.2.1 → jsonunwrap-0.2.3}/jsonunwrap.egg-info/top_level.txt +0 -0
- {jsonunwrap-0.2.1 → jsonunwrap-0.2.3}/setup.cfg +0 -0
- {jsonunwrap-0.2.1 → jsonunwrap-0.2.3}/tests/test_core.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: jsonunwrap
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: A small python package that unpacks data from a JSON url and converts it into a csv file.
|
|
5
5
|
Author-email: njuedominic <njuemugodominic@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/njuedominic/json-unwrap
|
|
@@ -19,13 +19,20 @@ Requires-Dist: pandas>=2.0.0
|
|
|
19
19
|
|
|
20
20
|
```python
|
|
21
21
|
>>> import jsonunwrap as ju
|
|
22
|
-
>>> url = "https://dummyjson.com/
|
|
22
|
+
>>> url = "https://dummyjson.com/products"
|
|
23
23
|
>>> fetchdata = ju.fetch_json(url)
|
|
24
24
|
>>> df = ju.unwrap_data(fetchdata)
|
|
25
25
|
>>> df.columns
|
|
26
|
-
Index(['id', '
|
|
26
|
+
Index(['id', 'title', 'description', 'category', 'price', 'discountPercentage',
|
|
27
|
+
'rating', 'stock', 'tags', 'brand', 'sku', 'weight',
|
|
28
|
+
'warrantyInformation', 'shippingInformation', 'availabilityStatus',
|
|
29
|
+
'returnPolicy', 'minimumOrderQuantity', 'images', 'thumbnail',
|
|
30
|
+
'dimensions.width', 'dimensions.height', 'dimensions.depth',
|
|
31
|
+
'meta.createdAt', 'meta.updatedAt', 'meta.barcode', 'meta.qrCode',
|
|
32
|
+
'rating_reviews', 'comment', 'date', 'reviewerName', 'reviewerEmail'],
|
|
33
|
+
dtype='object')
|
|
27
34
|
>>> len(df)
|
|
28
|
-
|
|
35
|
+
3771
|
|
29
36
|
```
|
|
30
37
|
|
|
31
38
|
jsonunwrap allows you to deeply normalize complex, semi-structured nested JSON data into clean pandas DataFrames extremely easily.
|
|
@@ -4,13 +4,20 @@
|
|
|
4
4
|
|
|
5
5
|
```python
|
|
6
6
|
>>> import jsonunwrap as ju
|
|
7
|
-
>>> url = "https://dummyjson.com/
|
|
7
|
+
>>> url = "https://dummyjson.com/products"
|
|
8
8
|
>>> fetchdata = ju.fetch_json(url)
|
|
9
9
|
>>> df = ju.unwrap_data(fetchdata)
|
|
10
10
|
>>> df.columns
|
|
11
|
-
Index(['id', '
|
|
11
|
+
Index(['id', 'title', 'description', 'category', 'price', 'discountPercentage',
|
|
12
|
+
'rating', 'stock', 'tags', 'brand', 'sku', 'weight',
|
|
13
|
+
'warrantyInformation', 'shippingInformation', 'availabilityStatus',
|
|
14
|
+
'returnPolicy', 'minimumOrderQuantity', 'images', 'thumbnail',
|
|
15
|
+
'dimensions.width', 'dimensions.height', 'dimensions.depth',
|
|
16
|
+
'meta.createdAt', 'meta.updatedAt', 'meta.barcode', 'meta.qrCode',
|
|
17
|
+
'rating_reviews', 'comment', 'date', 'reviewerName', 'reviewerEmail'],
|
|
18
|
+
dtype='object')
|
|
12
19
|
>>> len(df)
|
|
13
|
-
|
|
20
|
+
3771
|
|
14
21
|
```
|
|
15
22
|
|
|
16
23
|
jsonunwrap allows you to deeply normalize complex, semi-structured nested JSON data into clean pandas DataFrames extremely easily.
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module contains the core functionality for the jsonunwrap package.
|
|
3
|
+
The main function is `json_to_csv`, which fetches JSON from a URL, flattens it, and writes the result to a CSV file.
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
from typing import Any, Dict, List, Union
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import requests
|
|
9
|
+
|
|
10
|
+
def unwrap_data(data: Union[Dict[str, Any], List[Any]]) -> pd.DataFrame:
    """
    Normalizes and deeply flattens semi-structured JSON data into a pandas DataFrame.

    Nested dictionaries become additional columns and nested lists are
    exploded into one row per element. Elements that are themselves
    dictionaries are flattened on a subsequent pass.

    Args:
        data: A decoded JSON document. A list is treated as a list of
            records. A dict with at most four keys, one of which holds a
            list of records (dicts), is treated as a wrapper and only that
            list is used; any other dict is treated as a single record.

    Returns:
        The flattened DataFrame. Whenever a list column is exploded the
        frame is re-indexed with a fresh 0..n-1 index.
    """
    # 1. Ensure we start with a clean record list.
    if isinstance(data, dict):
        # Only a list of dicts counts as wrapped records (like
        # {"products": [...]}); unwrapping a list of primitives would make
        # json_normalize produce a frame with no columns at all.
        record_keys = [
            key
            for key, value in data.items()
            if isinstance(value, list)
            and all(isinstance(item, dict) for item in value)
        ]
        if record_keys and len(data) <= 4:
            main_data = data[record_keys[0]]
        else:
            main_data = [data]
    else:
        main_data = data

    # 2. Base normalization.
    df = pd.json_normalize(main_data)

    # 3. Work through a queue of columns that may still hold nested values.
    columns_to_process = list(df.columns)

    while columns_to_process:
        col = columns_to_process.pop(0)

        # The column may have been replaced by an earlier iteration.
        if col not in df.columns:
            continue

        non_null_vals = df[col].dropna()
        if non_null_vals.empty:
            continue

        if any(isinstance(val, dict) for val in non_null_vals):
            # Build from a plain list and set the index explicitly so the
            # alignment below does not depend on how json_normalize treats
            # Series input.
            nested_df = pd.json_normalize(non_null_vals.tolist())
            nested_df.index = non_null_vals.index

            remaining = df.drop(columns=[col])
            taken = set(remaining.columns)
            # join() only applies rsuffix to names that collide with an
            # existing column, so queue the names the new columns really get.
            new_cols = [
                f"{c}_{col}" if c in taken else c for c in nested_df.columns
            ]
            df = remaining.join(nested_df, rsuffix=f"_{col}")
            columns_to_process.extend(new_cols)

        elif any(isinstance(val, list) for val in non_null_vals):
            # Peek at the first non-empty list to learn whether its elements
            # are dictionaries that need another flattening pass.
            first_list = next(
                (v for v in non_null_vals if isinstance(v, list) and v), None
            )

            # ignore_index keeps the index unique; duplicate labels would
            # turn the index-based join above into a per-label cartesian
            # product and inflate the row count.
            df = df.explode(col, ignore_index=True)

            if first_list and isinstance(first_list[0], dict):
                columns_to_process.append(col)

    return df
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def fetch_json(url: str, **kwargs: Any) -> Union[Dict[str, Any], List[Any]]:
    """
    Fetches raw JSON data from a URL.

    Args:
        url: The endpoint web target.
        **kwargs: Additional arguments passed directly to requests.get (e.g., headers, auth).
            A ``timeout`` of 30 seconds is applied unless the caller
            supplies one (pass ``timeout=None`` to wait indefinitely).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response status indicates an error
            (raised by ``raise_for_status``).
    """
    # requests applies no timeout by default, so an unresponsive server
    # would otherwise block the caller indefinitely.
    kwargs.setdefault("timeout", 30)
    response = requests.get(url, **kwargs)
    response.raise_for_status()
    return response.json()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def json_to_csv(url: str, output_path: str, **kwargs: Any) -> pd.DataFrame:
    """
    Fetches JSON from a URL, deeply flattens it, and saves it directly to a CSV file.

    Args:
        url: The endpoint URL containing the target JSON data.
        output_path: Target filesystem path where the CSV will be written.
        **kwargs: Additional arguments forwarded unchanged to ``fetch_json``
            (e.g., headers, auth), so authenticated endpoints can be
            exported too.

    Returns:
        The generated pandas DataFrame.
    """
    # Ensure the parent output directory exists; a bare filename has no
    # directory component and needs nothing created.
    directory = os.path.dirname(output_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    raw_data = fetch_json(url, **kwargs)
    df = unwrap_data(raw_data)

    # The row index is not written to the file.
    df.to_csv(output_path, index=False)
    return df
|
|
98
|
+
|
|
99
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: jsonunwrap
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: A small python package that unpacks data from a JSON url and converts it into a csv file.
|
|
5
5
|
Author-email: njuedominic <njuemugodominic@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/njuedominic/json-unwrap
|
|
@@ -19,13 +19,20 @@ Requires-Dist: pandas>=2.0.0
|
|
|
19
19
|
|
|
20
20
|
```python
|
|
21
21
|
>>> import jsonunwrap as ju
|
|
22
|
-
>>> url = "https://dummyjson.com/
|
|
22
|
+
>>> url = "https://dummyjson.com/products"
|
|
23
23
|
>>> fetchdata = ju.fetch_json(url)
|
|
24
24
|
>>> df = ju.unwrap_data(fetchdata)
|
|
25
25
|
>>> df.columns
|
|
26
|
-
Index(['id', '
|
|
26
|
+
Index(['id', 'title', 'description', 'category', 'price', 'discountPercentage',
|
|
27
|
+
'rating', 'stock', 'tags', 'brand', 'sku', 'weight',
|
|
28
|
+
'warrantyInformation', 'shippingInformation', 'availabilityStatus',
|
|
29
|
+
'returnPolicy', 'minimumOrderQuantity', 'images', 'thumbnail',
|
|
30
|
+
'dimensions.width', 'dimensions.height', 'dimensions.depth',
|
|
31
|
+
'meta.createdAt', 'meta.updatedAt', 'meta.barcode', 'meta.qrCode',
|
|
32
|
+
'rating_reviews', 'comment', 'date', 'reviewerName', 'reviewerEmail'],
|
|
33
|
+
dtype='object')
|
|
27
34
|
>>> len(df)
|
|
28
|
-
|
|
35
|
+
3771
|
|
29
36
|
```
|
|
30
37
|
|
|
31
38
|
jsonunwrap allows you to deeply normalize complex, semi-structured nested JSON data into clean pandas DataFrames extremely easily.
|
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
This module contains the core functionality for the json_unwrap package.
|
|
3
|
-
The main function is `json_to_csv`, which converts a JSON file to a CSV file.
|
|
4
|
-
"""
|
|
5
|
-
import os
|
|
6
|
-
from typing import Any, Dict, List, Union
|
|
7
|
-
import pandas as pd
|
|
8
|
-
import requests
|
|
9
|
-
|
|
10
|
-
def unwrap_data(data: Union[Dict[str, Any], List[Any]]) -> pd.DataFrame:
    """
    Normalizes and deeply flattens semi-structured JSON data into a pandas DataFrame.

    List-valued columns are exploded into one row per element and
    dict-valued columns are replaced by one column per key. This repeats
    until a full scan of the columns changes nothing.

    Args:
        data: A decoded JSON document; a dict is treated as a single record,
            anything else is passed to ``pd.json_normalize`` as-is.

    Returns:
        The flattened DataFrame. Whenever a list column is exploded the
        frame is re-indexed with a fresh 0..n-1 index.
    """
    main_data = [data] if isinstance(data, dict) else data

    df = pd.json_normalize(main_data)

    # Each column name is exploded at most once so the scan always terminates.
    exploded_columns = set()

    changed = True
    while changed:
        changed = False
        for col in list(df.columns):
            values = df[col].dropna()

            # 1. Explode lists (only if this exact column name has not been
            #    exploded yet).
            if col not in exploded_columns and any(
                isinstance(val, list) for val in values
            ):
                # ignore_index keeps the index unique; duplicate labels would
                # turn the index-based join below into a per-label cartesian
                # product and inflate the row count.
                df = df.explode(col, ignore_index=True)
                exploded_columns.add(col)
                changed = True
                break  # Column contents changed; rescan from the start.

            # 2. Normalize and merge nested dictionaries.
            if any(isinstance(val, dict) for val in values):
                # Build from a plain list and set the index explicitly so the
                # alignment does not depend on how json_normalize treats
                # Series input.
                nested_df = pd.json_normalize(df[col].tolist())
                nested_df.index = df.index
                df = df.drop(columns=[col]).join(nested_df, rsuffix=f"_{col}")
                changed = True
                break

    return df
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def fetch_json(url: str, **kwargs: Any) -> Union[Dict[str, Any], List[Any]]:
    """
    Fetches raw JSON data from a URL.

    Args:
        url: The endpoint web target.
        **kwargs: Additional arguments passed directly to requests.get (e.g., headers, auth).
            A ``timeout`` of 30 seconds is applied unless the caller
            supplies one (pass ``timeout=None`` to wait indefinitely).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response status indicates an error
            (raised by ``raise_for_status``).
    """
    # requests applies no timeout by default, so an unresponsive server
    # would otherwise block the caller indefinitely.
    kwargs.setdefault("timeout", 30)
    response = requests.get(url, **kwargs)
    response.raise_for_status()
    return response.json()
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def json_to_csv(url: str, output_path: str) -> pd.DataFrame:
    """
    Downloads JSON from a URL, flattens it, and writes the result to a CSV file.

    Args:
        url: Address of the endpoint serving the JSON document.
        output_path: Filesystem path of the CSV file to write. Missing parent
            directories are created first.

    Returns:
        The flattened pandas DataFrame that was written to disk.
    """
    # Create the parent folder when the path has one; a bare filename yields
    # an empty directory component and needs nothing created.
    parent_dir = os.path.split(output_path)[0]
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    frame = unwrap_data(fetch_json(url))

    # The row index is not written to the file.
    frame.to_csv(output_path, index=False)
    return frame
|
|
79
|
-
|
|
80
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|