parseimagenet 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,162 @@
1
+ from pathlib import Path
2
+ from collections import defaultdict
3
+ import random
4
+ import argparse
5
+ import re
6
+
7
+ # keywords
8
+ try:
9
+ from .keywords.birds import bird_keywords
10
+ except ImportError:
11
+ from keywords.birds import bird_keywords
12
+
13
# Preset name -> keyword list; consulted when the caller passes no custom keywords.
KEYWORD_PRESETS = dict(birds=bird_keywords)
17
+
18
def get_available_presets():
    """List the names of every preset registered in ``KEYWORD_PRESETS``."""
    preset_names = [name for name in KEYWORD_PRESETS]
    return preset_names
21
+
22
def _load_synset_mapping(synset_mapping_file):
    """Read the synset mapping file into a ``{wnid: category names}`` dict."""
    synset_mapping = {}
    with open(synset_mapping_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the first whitespace run only, so the (possibly
            # multi-word) category names stay in one piece.
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                wnid, category_name = parts
                synset_mapping[wnid] = category_name
    return synset_mapping


def _load_category_images(train_annotations):
    """Group the image ids listed in the annotation file by their wnid prefix."""
    category_images = defaultdict(list)
    with open(train_annotations, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if parts:
                image_path = parts[0]  # e.g., "n01440764/n01440764_10026"
                wnid = image_path.split('/')[0]
                category_images[wnid].append(image_path)
    return category_images


def get_image_paths_by_keywords(base_path, preset="birds", keywords=None, num_images=200):
    """
    Extract file paths for images matching specified keywords.

    A category matches when any keyword occurs in its name as a whole word
    (case-insensitive). Only categories that have at least one entry in the
    training annotation file are considered. Progress is reported with
    ``print``.

    Args:
        base_path: Path to ImageNet-Subset directory (``str`` or ``Path``).
        preset: Name of predefined keyword list (default: "birds").
            Use get_available_presets() to see all.
        keywords: Custom list of keywords. If provided, overrides preset.
        num_images: Maximum number of random images to extract (default: 200).
            Fewer are returned when fewer matching images are available.

    Returns:
        List of Path objects to the selected images (empty if nothing
        matched). The paths are built from the annotation file; they are not
        filtered by whether the file exists on disk.

    Raises:
        TypeError: If keywords is a single string, not a list, or contains
            non-string items.
        ValueError: If keywords is None and preset is unknown, or if
            num_images is negative.
    """
    # Determine which keywords to use
    if keywords is not None:
        if isinstance(keywords, str):
            raise TypeError("keywords must be a list of strings, not a single string. Use keywords=['your_keyword'] instead.")
        if not isinstance(keywords, list) or not all(isinstance(k, str) for k in keywords):
            raise TypeError("keywords must be a list of strings.")
        search_keywords = keywords
    else:
        if preset not in KEYWORD_PRESETS:
            available = get_available_presets()
            raise ValueError(f"Unknown preset '{preset}'. Available presets: {available}")
        search_keywords = KEYWORD_PRESETS[preset]

    # Fail early with a clear message instead of an opaque random.sample error.
    if num_images < 0:
        raise ValueError(f"num_images must be non-negative, got {num_images}")

    # Accept plain strings as well as Path objects.
    base_path = Path(base_path)
    train_annotations = base_path / "ILSVRC" / "ImageSets" / "CLS-LOC" / "train_cls.txt"
    synset_mapping_file = base_path / "LOC_synset_mapping.txt"
    data_path = base_path / "ILSVRC" / "Data" / "CLS-LOC" / "train"

    # Load synset mapping (wnid -> category names)
    print("")
    print("Loading category names...")
    synset_mapping = _load_synset_mapping(synset_mapping_file)
    print(f"Loaded {len(synset_mapping)} categories\n")

    # Parse training annotations
    print("Parsing training annotations...")
    category_images = _load_category_images(train_annotations)
    print(f"Found {len(category_images)} unique categories\n")

    # Find matching categories
    print("=" * 80)
    print(f"SEARCHING WITH KEYWORDS: {search_keywords}")
    print("=" * 80)

    # Compile each whole-word pattern once rather than once per category.
    keyword_patterns = [
        re.compile(rf'\b{re.escape(keyword)}\b', re.IGNORECASE)
        for keyword in search_keywords
    ]

    matching_wnids = []
    for wnid, category_name in synset_mapping.items():
        if wnid not in category_images:
            continue
        if any(pattern.search(category_name) for pattern in keyword_patterns):
            matching_wnids.append(wnid)
            count = len(category_images[wnid])
            print(f"{wnid}: {category_name} ({count} images)")

    print(f"\n{'=' * 80}")
    print(f"Total matching categories: {len(matching_wnids)}")
    print(f"{'=' * 80}\n")

    # Collect all matching images
    all_matching_images = []
    for wnid in matching_wnids:
        all_matching_images.extend(category_images[wnid])

    print(f"Total matching images available: {len(all_matching_images)}")

    if not all_matching_images:
        print("\nNo matching images found!")
        return []

    # Select random images (never more than are available)
    num_to_select = min(num_images, len(all_matching_images))
    selected_image_paths = random.sample(all_matching_images, num_to_select)

    # Convert to full file paths
    full_paths = [data_path / f"{img_path}.JPEG" for img_path in selected_image_paths]

    print(f"\nSelected {num_to_select} random images")

    # Report (but do not filter on) how many of the selected files exist
    existing = sum(1 for p in full_paths if p.exists())
    print(f"Verified {existing}/{num_to_select} files exist on disk\n")

    return full_paths
128
+
129
+
130
def main(argv=None):
    """
    Command-line entry point: select image paths and print a short preview.

    Args:
        argv: Optional list of argument strings to parse. When None,
            argparse reads the process command line.

    Exits through ``parser.error`` (usage message, non-zero status) when
    --num_images is negative or --base_path is not an existing directory.
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(description='Extract ImageNet image paths by category keywords')
    parser.add_argument('--num_images', type=int, default=200,
                        help='Number of random images to extract (default: 200)')
    parser.add_argument('--preset', type=str, default='birds',
                        help=f'Predefined keyword preset (default: birds). Available: {get_available_presets()}')
    parser.add_argument('--keywords', nargs='+', default=None,
                        help='Custom keywords to match in category names (overrides --preset)')
    # NOTE(review): this default is a developer-specific location, kept only
    # for backward compatibility; other users must pass --base_path.
    parser.add_argument('--base_path', type=str,
                        default='/Users/mrt/Documents/MrT/code/computer-vision/image-bank/ImageNet-Subset',
                        help='Path to ImageNet-Subset directory')

    args = parser.parse_args(argv)

    if args.num_images < 0:
        parser.error("--num_images must be non-negative")

    base_path = Path(args.base_path)
    # Report a missing dataset directory as a usage error rather than letting
    # a FileNotFoundError traceback surface from the file reads.
    if not base_path.is_dir():
        parser.error(f"--base_path is not an existing directory: {base_path}")

    # Extract image paths
    image_paths = get_image_paths_by_keywords(
        base_path,
        preset=args.preset,
        keywords=args.keywords,
        num_images=args.num_images,
    )

    # Print first 10 paths as example
    if image_paths:
        print("\nFirst 10 image paths:")
        for i, path in enumerate(image_paths[:10], 1):
            print(f"{i}. {path}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,4 @@
1
+ from .ParseImageNetSubset import get_image_paths_by_keywords, get_available_presets, KEYWORD_PRESETS
2
+ from .keywords.birds import bird_keywords
3
+
4
+ __all__ = ['get_image_paths_by_keywords', 'get_available_presets', 'KEYWORD_PRESETS', 'bird_keywords']
@@ -0,0 +1,3 @@
1
+ from .birds import bird_keywords
2
+
3
+ __all__ = ['bird_keywords']
@@ -0,0 +1,12 @@
1
# Keyword list registered as the "birds" preset.
bird_keywords = """
    bird finch robin eagle hawk owl sparrow
    warbler hummingbird jay cardinal chickadee
    duck goose swan crane heron pelican
    parrot cockatoo macaw penguin ostrich
    brambling bunting junco oriole magpie
    cock hen rooster chicken grouse partridge
    quail pheasant peacock flamingo stork
    albatross cormorant kingfisher hornbill
    toucan woodpecker flycatcher shrike vireo
    wren thrush mockingbird starling pipit
""".split()
@@ -0,0 +1,139 @@
1
+ Metadata-Version: 2.4
2
+ Name: parseimagenet
3
+ Version: 1.0.3
4
+ Summary: Extract ImageNet image paths by category keywords
5
+ Author-email: Reed Turgeon <turgeon.dev@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/MrT3313/Parse-ImageNet
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Requires-Python: >=3.8
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Provides-Extra: dev
14
+ Requires-Dist: jupyter; extra == "dev"
15
+ Requires-Dist: ipykernel; extra == "dev"
16
+ Dynamic: license-file
17
+
18
+ # ParseImageNet
19
+
20
+ Extract image file paths from ImageNet by matching category keywords. Useful for creating custom subsets of ImageNet for training or evaluation.
21
+
22
+ ![Python Version](https://img.shields.io/pypi/pyversions/parseimagenet)
23
+ ![License](https://img.shields.io/github/license/MrT3313/Parse-ImageNet)
24
+ ![Build Status](https://github.com/MrT3313/Parse-ImageNet/actions/workflows/main.yml/badge.svg)
25
+
26
+ ## [Kaggle Dataset](https://www.kaggle.com/competitions/imagenet-object-localization-challenge/data)
27
+
28
+ ## Prerequisites
29
+
30
+ - Python 3.8+
31
+ - ImageNet dataset (or a subset) with the standard ILSVRC directory structure:
32
+ ```
33
+ ImageNet-Subset/
34
+ ├── LOC_synset_mapping.txt
35
+ └── ILSVRC/
36
+ ├── ImageSets/
37
+ │ └── CLS-LOC/
38
+ │ └── train_cls.txt
39
+ └── Data/
40
+ └── CLS-LOC/
41
+ └── train/
42
+ ├── n01440764/
43
+ │ ├── n01440764_10026.JPEG
44
+ │ └── ...
45
+ └── ...
46
+ ```
47
+
48
+ ## Installation
49
+
50
+ Clone the repository:
51
+
52
+ ```bash
53
+ git clone https://github.com/MrT3313/Parse-ImageNet.git
54
+ ```
55
+
56
+ Then install the package into the environment where you run Jupyter:
57
+
58
+ ```bash
59
+ # Using pip
60
+ pip install -e /path/to/Parse-ImageNet
61
+ # ex: pip install -e /Users/mrt/Documents/MrT/code/computer-vision/ParseImageNet
62
+ ```
63
+
64
+ The `-e` flag installs in "editable" mode, so code changes are immediately available without reinstalling. However, changes to package metadata (version, dependencies) in `pyproject.toml` still require running `pip install -e .` again.
65
+
66
+ ## Usage
67
+
68
+ > [!NOTE]
69
+ >
70
+ > [Example Notebook](/DOCS/ExampleNotebook.ipynb)
71
+
72
+ ### In Jupyter Lab / Jupyter Notebook
73
+
74
+ ```python
75
+ from pathlib import Path
76
+ from parseimagenet import get_image_paths_by_keywords
77
+
78
+ # Set the path to your ImageNet directory
79
+ base_path = Path('/path/to/your/ImageNet-Subset')
80
+ # ex: /Users/mrt/Documents/MrT/code/computer-vision/image-bank/ImageNet-Subset
81
+
82
+ # Use the default "birds" preset
83
+ image_paths = get_image_paths_by_keywords(base_path=base_path)
84
+
85
+ # image_paths is a list of Path objects
86
+ print(f"Found {len(image_paths)} images")
87
+ print(image_paths[:5])
88
+ ```
89
+
90
+ #### Using Preset Keywords
91
+
92
+ Presets are predefined keyword lists for common categories:
93
+
94
+ ```python
95
+ from parseimagenet import get_image_paths_by_keywords # main function
96
+ from parseimagenet import get_available_presets, KEYWORD_PRESETS # helpers
97
+
98
+ # See available presets
99
+ print(get_available_presets()) # ['birds']
100
+
101
+ # Use a specific preset
102
+ image_paths = get_image_paths_by_keywords(
103
+ base_path=base_path,
104
+ preset="birds",
105
+ num_images=200
106
+ )
107
+
108
+ # Access preset keywords directly
109
+ print(KEYWORD_PRESETS["birds"])
110
+ ```
111
+
112
+ #### Using Custom Keywords
113
+
114
+ Custom keywords override the preset:
115
+
116
+ ```python
117
+ image_paths = get_image_paths_by_keywords(
118
+ base_path=base_path,
119
+ keywords=['dog', 'puppy', 'hound'],
120
+ num_images=100
121
+ )
122
+ ```
123
+
124
+ > [!NOTE]
125
+ >
126
+ > You can find all available category names in the `LOC_synset_mapping.txt` file.
127
+
128
+ ### Command Line
129
+
130
+ ```bash
131
+ # Use default preset (birds)
132
+ python -m parseimagenet.ParseImageNetSubset --base_path /path/to/ImageNet-Subset
133
+
134
+ # Use a specific preset
135
+ python -m parseimagenet.ParseImageNetSubset --base_path /path/to/ImageNet-Subset --preset birds --num_images 100
136
+
137
+ # Use custom keywords (overrides preset)
138
+ python -m parseimagenet.ParseImageNetSubset --base_path /path/to/ImageNet-Subset --keywords dog puppy --num_images 100
139
+ ```
@@ -0,0 +1,9 @@
1
+ parseimagenet/ParseImageNetSubset.py,sha256=0Labff37IR4rBch6pXEHBNTkX268SHri9BFyu7I0lI0,5989
2
+ parseimagenet/__init__.py,sha256=SYt4M0OvrtFHkMQu3hStVG0BeujmCg3wTEVJAy3J3ZA,247
3
+ parseimagenet/keywords/__init__.py,sha256=40hcsiPwbrHldKSdxVj4OF_7Db4iLu5VFgTn3o272oI,62
4
+ parseimagenet/keywords/birds.py,sha256=0HgdV-pxVOl3gKXk4ilWuqrJvkpP_NTVtzWwBDTahKQ,614
5
+ parseimagenet-1.0.3.dist-info/licenses/LICENSE,sha256=KRcBX6PTDBCujdtWzTfiXjJD61rSazBviMBMEm--GgA,1059
6
+ parseimagenet-1.0.3.dist-info/METADATA,sha256=cqdJTnC5PZPFDKtu3NyRYj0zEuhHp2EnNpiB1zBD-_s,3991
7
+ parseimagenet-1.0.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
8
+ parseimagenet-1.0.3.dist-info/top_level.txt,sha256=wjb0cdg_P23rfxPx9ZBLbxB_TwRSmXSMxLUFUeUootY,14
9
+ parseimagenet-1.0.3.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,7 @@
1
+ Copyright 2026 Reed Turgeon
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1 @@
1
+ parseimagenet