PyPI - parseimagenet - Versions diffs - 1.0.3__py3-none-any.whl - Mend

parseimagenet 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

parseimagenet/ParseImageNetSubset.py +162 -0
parseimagenet/__init__.py +4 -0
parseimagenet/keywords/__init__.py +3 -0
parseimagenet/keywords/birds.py +12 -0
parseimagenet-1.0.3.dist-info/METADATA +139 -0
parseimagenet-1.0.3.dist-info/RECORD +9 -0
parseimagenet-1.0.3.dist-info/WHEEL +5 -0
parseimagenet-1.0.3.dist-info/licenses/LICENSE +7 -0
parseimagenet-1.0.3.dist-info/top_level.txt +1 -0

parseimagenet/ParseImageNetSubset.py ADDED Viewed

@@ -0,0 +1,162 @@
+from pathlib import Path
+from collections import defaultdict
+import random
+import argparse
+import re
+# keywords
+try:
+    from .keywords.birds import bird_keywords
+except ImportError:
+    from keywords.birds import bird_keywords
+# Registry of predefined keyword presets.
+# Maps a preset name (the value accepted by the `preset` argument and the
+# --preset command-line option) to the list of keywords searched for that preset.
+KEYWORD_PRESETS = {
+    "birds": bird_keywords,
+}
+def get_available_presets():
+    """List the names of all presets registered in KEYWORD_PRESETS."""
+    # Iterating a dict yields its keys, in insertion order
+    return list(KEYWORD_PRESETS)
+def get_image_paths_by_keywords(base_path, preset="birds", keywords=None, num_images=200):
+    """
+    Pick a random sample of ImageNet training image paths whose category matches any keyword.
+    Category names are read from LOC_synset_mapping.txt and the image list from
+    ILSVRC/ImageSets/CLS-LOC/train_cls.txt, both located under base_path.
+    A category matches when any keyword occurs in its name as a whole word
+    (case-insensitive). Progress and a summary of the matches are printed to stdout.
+    Args:
+        base_path: Path to ImageNet-Subset directory (str or path-like object)
+        preset: Name of predefined keyword list (default: "birds").
+                Use get_available_presets() to see all.
+        keywords: Custom list of keywords. If provided, overrides preset.
+        num_images: Maximum number of random images to extract (default: 200).
+                    Fewer are returned when fewer matching images are available.
+    Returns:
+        List of Path objects to the selected images, or an empty list when
+        nothing matched. Paths are built from the annotation file; existence on
+        disk is only counted for the printed report, missing files are not removed.
+    Raises:
+        TypeError: If keywords is provided but is not a list.
+        ValueError: If keywords is None and preset is unknown, or if num_images is negative.
+    """
+    # Determine which keywords to use
+    if keywords is not None:
+        # Dedicated message for the common mistake of passing a bare string
+        if isinstance(keywords, str):
+            raise TypeError("keywords must be a list of strings, not a single string. Use keywords=['your_keyword'] instead.")
+        if not isinstance(keywords, list):
+            raise TypeError("keywords must be a list of strings.")
+        search_keywords = keywords
+    else:
+        # Use preset
+        if preset not in KEYWORD_PRESETS:
+            available = get_available_presets()
+            raise ValueError(f"Unknown preset '{preset}'. Available presets: {available}")
+        search_keywords = KEYWORD_PRESETS[preset]
+    # Fail early with a clear message instead of an opaque random.sample error later
+    if num_images < 0:
+        raise ValueError(f"num_images must be non-negative, got {num_images}")
+    # Accept plain strings as well as Path objects
+    base_path = Path(base_path)
+    train_annotations = base_path / "ILSVRC" / "ImageSets" / "CLS-LOC" / "train_cls.txt"
+    synset_mapping_file = base_path / "LOC_synset_mapping.txt"
+    data_path = base_path / "ILSVRC" / "Data" / "CLS-LOC" / "train"
+    # Load synset mapping (wnid -> category names)
+    print("")
+    print("Loading category names...")
+    synset_mapping = {}
+    # Explicit encoding so decoding does not depend on the platform locale
+    with open(synset_mapping_file, 'r', encoding='utf-8') as f:
+        for line in f:
+            # First token is the wnid, the rest of the line is the category name
+            parts = line.strip().split(maxsplit=1)
+            if len(parts) == 2:
+                wnid, category_name = parts
+                synset_mapping[wnid] = category_name
+    print(f"Loaded {len(synset_mapping)} categories\n")
+    # Parse training annotations
+    print("Parsing training annotations...")
+    category_images = defaultdict(list)
+    with open(train_annotations, 'r', encoding='utf-8') as f:
+        for line in f:
+            parts = line.split()
+            if parts:
+                image_path = parts[0]  # e.g., "n01440764/n01440764_10026"
+                wnid = image_path.split('/')[0]  # Extract the wnid
+                category_images[wnid].append(image_path)
+    print(f"Found {len(category_images)} unique categories\n")
+    # Find matching categories
+    print("=" * 80)
+    print(f"SEARCHING WITH KEYWORDS: {search_keywords}")
+    print("=" * 80)
+    # Compile each keyword once instead of once per category; \b keeps matches whole-word
+    keyword_patterns = [
+        re.compile(rf'\b{re.escape(keyword)}\b', re.IGNORECASE)
+        for keyword in search_keywords
+    ]
+    matching_wnids = []
+    for wnid, category_name in synset_mapping.items():
+        if not any(pattern.search(category_name) for pattern in keyword_patterns):
+            continue
+        # Only keep categories that actually have training images listed
+        if wnid in category_images:
+            matching_wnids.append(wnid)
+            count = len(category_images[wnid])
+            print(f"{wnid}: {category_name} ({count} images)")
+    print(f"\n{'=' * 80}")
+    print(f"Total matching categories: {len(matching_wnids)}")
+    print(f"{'=' * 80}\n")
+    # Collect all matching images
+    all_matching_images = []
+    for wnid in matching_wnids:
+        all_matching_images.extend(category_images[wnid])
+    print(f"Total matching images available: {len(all_matching_images)}")
+    if not all_matching_images:
+        print("\nNo matching images found!")
+        return []
+    # Select random images, capped at the number available
+    num_to_select = min(num_images, len(all_matching_images))
+    selected_image_paths = random.sample(all_matching_images, num_to_select)
+    # Convert to full file paths
+    full_paths = [data_path / f"{img_path}.JPEG" for img_path in selected_image_paths]
+    print(f"\nSelected {num_to_select} random images")
+    # Verify how many actually exist (reported only; the list is returned unfiltered)
+    existing = sum(1 for p in full_paths if p.exists())
+    print(f"Verified {existing}/{num_to_select} files exist on disk\n")
+    return full_paths
+def main():
+    """Command-line entry point: parse options, collect matching image paths and print a preview."""
+    # NOTE(review): the --base_path default is an absolute path under /Users/mrt,
+    # so on any other machine --base_path has to be passed explicitly.
+    cli = argparse.ArgumentParser(description='Extract ImageNet image paths by category keywords')
+    cli.add_argument(
+        '--num_images',
+        type=int,
+        default=200,
+        help='Number of random images to extract (default: 200)',
+    )
+    cli.add_argument(
+        '--preset',
+        type=str,
+        default='birds',
+        help=f'Predefined keyword preset (default: birds). Available: {get_available_presets()}',
+    )
+    cli.add_argument(
+        '--keywords',
+        nargs='+',
+        default=None,
+        help='Custom keywords to match in category names (overrides --preset)',
+    )
+    cli.add_argument(
+        '--base_path',
+        type=str,
+        default='/Users/mrt/Documents/MrT/code/computer-vision/image-bank/ImageNet-Subset',
+        help='Path to ImageNet-Subset directory',
+    )
+    options = cli.parse_args()
+    # Run the extraction with the parsed options
+    selected = get_image_paths_by_keywords(
+        Path(options.base_path),
+        preset=options.preset,
+        keywords=options.keywords,
+        num_images=options.num_images,
+    )
+    # Nothing to preview when no image was selected
+    if not selected:
+        return
+    # Show at most the first ten paths as a sample
+    print("\nFirst 10 image paths:")
+    for position, image_path in enumerate(selected[:10], start=1):
+        print(f"{position}. {image_path}")
+if __name__ == "__main__":
+    main()

parseimagenet/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .ParseImageNetSubset import get_image_paths_by_keywords, get_available_presets, KEYWORD_PRESETS
+from .keywords.birds import bird_keywords
+__all__ = ['get_image_paths_by_keywords', 'get_available_presets', 'KEYWORD_PRESETS', 'bird_keywords']

parseimagenet/keywords/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .birds import bird_keywords
+__all__ = ['bird_keywords']

parseimagenet/keywords/birds.py ADDED Viewed

@@ -0,0 +1,12 @@
+# Keywords for the "birds" preset.
+# The search in ParseImageNetSubset.py matches each keyword against category
+# names as a whole word (case-insensitive), so compound names such as
+# 'hummingbird' or 'mockingbird' are listed in addition to 'bird'.
+# NOTE(review): some entries (e.g. 'crane') may also name non-bird categories;
+# verify against LOC_synset_mapping.txt.
+bird_keywords = [
+    'bird', 'finch', 'robin', 'eagle', 'hawk', 'owl', 'sparrow',
+    'warbler', 'hummingbird', 'jay', 'cardinal', 'chickadee',
+    'duck', 'goose', 'swan', 'crane', 'heron', 'pelican',
+    'parrot', 'cockatoo', 'macaw', 'penguin', 'ostrich',
+    'brambling', 'bunting', 'junco', 'oriole', 'magpie',
+    'cock', 'hen', 'rooster', 'chicken', 'grouse', 'partridge',
+    'quail', 'pheasant', 'peacock', 'flamingo', 'stork',
+    'albatross', 'cormorant', 'kingfisher', 'hornbill',
+    'toucan', 'woodpecker', 'flycatcher', 'shrike', 'vireo',
+    'wren', 'thrush', 'mockingbird', 'starling', 'pipit'
+]

parseimagenet-1.0.3.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,139 @@
+Metadata-Version: 2.4
+Name: parseimagenet
+Version: 1.0.3
+Summary: Extract ImageNet image paths by category keywords
+Author-email: Reed Turgeon <turgeon.dev@gmail.com>
+License: MIT
+Project-URL: Homepage, https://github.com/MrT3313/Parse-ImageNet
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Provides-Extra: dev
+Requires-Dist: jupyter; extra == "dev"
+Requires-Dist: ipykernel; extra == "dev"
+Dynamic: license-file
+# ParseImageNet
+Extract image file paths from ImageNet by matching category keywords. Useful for creating custom subsets of ImageNet for training or evaluation.
+![Python Version](https://img.shields.io/pypi/pyversions/parseimagenet)
+![License](https://img.shields.io/github/license/MrT3313/Parse-ImageNet)
+![Build Status](https://github.com/MrT3313/Parse-ImageNet/actions/workflows/main.yml/badge.svg)
+## [Kaggle Dataset](https://www.kaggle.com/competitions/imagenet-object-localization-challenge/data)
+## Prerequisites
+- Python 3.8+
+- ImageNet dataset (or a subset) with the standard ILSVRC directory structure:
+  ```
+  ImageNet-Subset/
+  ├── LOC_synset_mapping.txt
+  └── ILSVRC/
+      ├── ImageSets/
+      │   └── CLS-LOC/
+      │       └── train_cls.txt
+      └── Data/
+          └── CLS-LOC/
+              └── train/
+                  ├── n01440764/
+                  │   ├── n01440764_10026.JPEG
+                  │   └── ...
+                  └── ...
+  ```
+## Installation
+Clone the repository:
+```bash
+git clone https://github.com/MrT3313/Parse-ImageNet.git
+```
+Then install the package into the environment where you run Jupyter:
+```bash
+# Using pip
+pip install -e /path/to/ParseImageNet
+# ex: pip install -e /Users/mrt/Documents/MrT/code/computer-vision/ParseImageNet
+```
+The `-e` flag installs in "editable" mode, so code changes are immediately available without reinstalling. However, changes to package metadata (version, dependencies) in `pyproject.toml` still require running `pip install -e .` again.
+## Usage
+> [!NOTE]
+>
+> [Example Notebook](/DOCS/ExampleNotebook.ipynb)
+### In Jupyter Lab / Jupyter Notebook
+```python
+from pathlib import Path
+from parseimagenet import get_image_paths_by_keywords
+# Set the path to your ImageNet directory
+base_path = Path('/path/to/your/ImageNet-Subset')
+# ex: /Users/mrt/Documents/MrT/code/computer-vision/image-bank/ImageNet-Subset
+# Use the default "birds" preset
+image_paths = get_image_paths_by_keywords(base_path=base_path)
+# image_paths is a list of Path objects
+print(f"Found {len(image_paths)} images")
+print(image_paths[:5])
+```
+#### Using Preset Keywords
+Presets are predefined keyword lists for common categories:
+```python
+from parseimagenet import get_image_paths_by_keywords # main function
+from parseimagenet import get_available_presets, KEYWORD_PRESETS # helpers
+# See available presets
+print(get_available_presets())  # ['birds']
+# Use a specific preset
+image_paths = get_image_paths_by_keywords(
+    base_path=base_path,
+    preset="birds",
+    num_images=200
+)
+# Access preset keywords directly
+print(KEYWORD_PRESETS["birds"])
+```
+#### Using Custom Keywords
+Custom keywords override the preset:
+```python
+image_paths = get_image_paths_by_keywords(
+    base_path=base_path,
+    keywords=['dog', 'puppy', 'hound'],
+    num_images=100
+)
+```
+> [!NOTE]
+>
+> You can find all applicable categories in the `LOC_synset_mapping.txt` file.
+### Command Line
+```bash
+# Use default preset (birds)
+python -m parseimagenet.ParseImageNetSubset --base_path /path/to/ImageNet-Subset
+# Use a specific preset
+python -m parseimagenet.ParseImageNetSubset --base_path /path/to/ImageNet-Subset --preset birds --num_images 100
+# Use custom keywords (overrides preset)
+python -m parseimagenet.ParseImageNetSubset --base_path /path/to/ImageNet-Subset --keywords dog puppy --num_images 100
+```

parseimagenet-1.0.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+parseimagenet/ParseImageNetSubset.py,sha256=0Labff37IR4rBch6pXEHBNTkX268SHri9BFyu7I0lI0,5989
+parseimagenet/__init__.py,sha256=SYt4M0OvrtFHkMQu3hStVG0BeujmCg3wTEVJAy3J3ZA,247
+parseimagenet/keywords/__init__.py,sha256=40hcsiPwbrHldKSdxVj4OF_7Db4iLu5VFgTn3o272oI,62
+parseimagenet/keywords/birds.py,sha256=0HgdV-pxVOl3gKXk4ilWuqrJvkpP_NTVtzWwBDTahKQ,614
+parseimagenet-1.0.3.dist-info/licenses/LICENSE,sha256=KRcBX6PTDBCujdtWzTfiXjJD61rSazBviMBMEm--GgA,1059
+parseimagenet-1.0.3.dist-info/METADATA,sha256=cqdJTnC5PZPFDKtu3NyRYj0zEuhHp2EnNpiB1zBD-_s,3991
+parseimagenet-1.0.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+parseimagenet-1.0.3.dist-info/top_level.txt,sha256=wjb0cdg_P23rfxPx9ZBLbxB_TwRSmXSMxLUFUeUootY,14
+parseimagenet-1.0.3.dist-info/RECORD,,

parseimagenet-1.0.3.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (80.10.2)
+Root-Is-Purelib: true
+Tag: py3-none-any

parseimagenet-1.0.3.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,7 @@
+Copyright 2026 Reed Turgeon
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

parseimagenet-1.0.3.dist-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+parseimagenet