gismap 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gismap/__init__.py +1 -0
- gismap/build.py +4 -0
- gismap/gisgraphs/widget.py +5 -5
- gismap/lab/lab_author.py +4 -7
- gismap/lab/labmap.py +3 -4
- gismap/lab_examples/lamsade.py +43 -0
- gismap/sources/dblp.py +1 -1
- gismap/sources/dblp_ttl.py +168 -0
- gismap/sources/hal.py +2 -2
- gismap/sources/ldb.py +501 -0
- gismap/sources/multi.py +2 -2
- gismap/utils/common.py +15 -10
- gismap/utils/logger.py +2 -0
- gismap/utils/requests.py +3 -1
- gismap/utils/zlist.py +68 -0
- {gismap-0.3.0.dist-info → gismap-0.4.0.dist-info}/METADATA +18 -5
- {gismap-0.3.0.dist-info → gismap-0.4.0.dist-info}/RECORD +19 -14
- {gismap-0.3.0.dist-info → gismap-0.4.0.dist-info}/WHEEL +1 -1
- {gismap-0.3.0.dist-info → gismap-0.4.0.dist-info}/licenses/AUTHORS.md +0 -0
gismap/__init__.py
CHANGED
|
@@ -4,6 +4,7 @@ from importlib.metadata import metadata
|
|
|
4
4
|
|
|
5
5
|
from gismap.sources.hal import HAL as HAL, HALAuthor as HALAuthor
|
|
6
6
|
from gismap.sources.dblp import DBLP as DBLP, DBLPAuthor as DBLPAuthor
|
|
7
|
+
from gismap.sources.ldb import LDB as LDB, LDBAuthor as LDBAuthor
|
|
7
8
|
from gismap.utils.common import get_classes as get_classes
|
|
8
9
|
from gismap.gismo import make_gismo as make_gismo
|
|
9
10
|
from gismap.search import (
|
gismap/build.py
ADDED
gismap/gisgraphs/widget.py
CHANGED
|
@@ -33,7 +33,7 @@ def safe_filename(name):
|
|
|
33
33
|
return f"gismap-{safe_str[:60]}.html"
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
place_holder = "Diego Perino, The-Dang Huynh, François Durand (hal: fradurand,
|
|
36
|
+
place_holder = "Diego Perino, The-Dang Huynh, François Durand (hal: fradurand, ldb: 38/11269), Rim Kaddah, Leonardo Linguaglossa, Céline Comte"
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
class GismapWidget:
|
|
@@ -66,7 +66,7 @@ class GismapWidget:
|
|
|
66
66
|
layout=widgets.Layout(width="50%", height="100px"),
|
|
67
67
|
)
|
|
68
68
|
self.dbs = widgets.RadioButtons(
|
|
69
|
-
options=["HAL", "
|
|
69
|
+
options=["HAL", "LDB", "Both"],
|
|
70
70
|
description="DB(s):",
|
|
71
71
|
layout=widgets.Layout(width="80px", max_width="20%"),
|
|
72
72
|
)
|
|
@@ -100,9 +100,9 @@ class GismapWidget:
|
|
|
100
100
|
dbs = (
|
|
101
101
|
"hal"
|
|
102
102
|
if self.dbs.value == "HAL"
|
|
103
|
-
else "
|
|
104
|
-
if self.dbs.value == "
|
|
105
|
-
else ["hal", "
|
|
103
|
+
else "ldb"
|
|
104
|
+
if self.dbs.value == "LDB"
|
|
105
|
+
else ["hal", "ldb"]
|
|
106
106
|
)
|
|
107
107
|
name = self.names.value
|
|
108
108
|
pattern = r",\s*(?![^()]*\))"
|
gismap/lab/lab_author.py
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
from dataclasses import dataclass, field
|
|
2
2
|
import re
|
|
3
3
|
|
|
4
|
-
from gismap import get_classes
|
|
4
|
+
from gismap import get_classes
|
|
5
5
|
from gismap.sources.models import DB, db_class_to_auth_class
|
|
6
6
|
from gismap.sources.multi import SourcedAuthor, sort_author_sources
|
|
7
7
|
from gismap.utils.common import LazyRepr, list_of_objects
|
|
8
8
|
from gismap.utils.logger import logger
|
|
9
9
|
|
|
10
10
|
db_dict = get_classes(DB, key="db_name")
|
|
11
|
-
default_dbs = [
|
|
11
|
+
default_dbs = ["hal", "ldb"]
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
@dataclass(repr=False)
|
|
@@ -27,8 +27,6 @@ class AuthorMetadata(LazyRepr):
|
|
|
27
27
|
Group of the author.
|
|
28
28
|
position: :class:`tuple`
|
|
29
29
|
Coordinates of the author.
|
|
30
|
-
keys: :class:`dict`
|
|
31
|
-
Some DB key values of the author.
|
|
32
30
|
"""
|
|
33
31
|
|
|
34
32
|
url: str = None
|
|
@@ -46,12 +44,11 @@ class LabAuthor(SourcedAuthor):
|
|
|
46
44
|
|
|
47
45
|
Improper key/values are ignored (with a warning).
|
|
48
46
|
|
|
49
|
-
|
|
50
|
-
>>> dummy= LabAuthor("My Name(img: https://my.url.img, group:me,url:https://mysite.org,hal:key1,dblp:toto,badkey:hello,no_colon_separator)")
|
|
47
|
+
>>> dummy= LabAuthor("My Name(img: https://my.url.img, group:me,url:https://mysite.org,hal:key1,ldb:toto,badkey:hello,no_colon_separator)")
|
|
51
48
|
>>> dummy.metadata
|
|
52
49
|
AuthorMetadata(url='https://mysite.org', img='https://my.url.img', group='me')
|
|
53
50
|
>>> dummy.sources
|
|
54
|
-
[HALAuthor(name='My Name', key='key1'),
|
|
51
|
+
[HALAuthor(name='My Name', key='key1'), LDBAuthor(name='My Name', key='toto')]
|
|
55
52
|
|
|
56
53
|
You can enter multiple keys for the same DB. HAL key types are automatically detected.
|
|
57
54
|
|
gismap/lab/labmap.py
CHANGED
|
@@ -38,7 +38,7 @@ class LabMap(MixInIO):
|
|
|
38
38
|
----------
|
|
39
39
|
name: :class:`str`
|
|
40
40
|
Name of the lab. Can be set as class or instance attribute.
|
|
41
|
-
dbs: :class:`list`, default=[:class:`~gismap.sources.hal.HAL`, :class:`~gismap.sources.
|
|
41
|
+
dbs: :class:`list`, default=[:class:`~gismap.sources.hal.HAL`, :class:`~gismap.sources.ldb.LDB`]
|
|
42
42
|
List of DB sources to use.
|
|
43
43
|
|
|
44
44
|
|
|
@@ -57,8 +57,7 @@ class LabMap(MixInIO):
|
|
|
57
57
|
def __init__(self, name=None, dbs=None):
|
|
58
58
|
if name is not None:
|
|
59
59
|
self.name = name
|
|
60
|
-
|
|
61
|
-
self.dbs = list_of_objects(dbs, db_dict, default=default_dbs)
|
|
60
|
+
self.dbs = dbs
|
|
62
61
|
self.author_selectors = [author_taboo_filter()]
|
|
63
62
|
self.publication_selectors = [
|
|
64
63
|
publication_size_filter(),
|
|
@@ -92,7 +91,7 @@ class LabMap(MixInIO):
|
|
|
92
91
|
if not all(f(author) for f in self.author_selectors):
|
|
93
92
|
continue
|
|
94
93
|
if len(author.sources) == 0:
|
|
95
|
-
author.auto_sources(dbs=self.dbs)
|
|
94
|
+
author.auto_sources(dbs=list_of_objects(self.dbs, db_dict, default=default_dbs))
|
|
96
95
|
if author.sources:
|
|
97
96
|
self.authors[author.key] = author
|
|
98
97
|
if author.metadata.img is None:
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from bs4 import BeautifulSoup as Soup
|
|
2
|
+
|
|
3
|
+
from gismap.lab import LabAuthor
|
|
4
|
+
from gismap.lab.lab_author import AuthorMetadata
|
|
5
|
+
from gismap.lab.labmap import LabMap
|
|
6
|
+
from gismap.utils.requests import get
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def lamsade_parse(div):
|
|
10
|
+
"""
|
|
11
|
+
Parameters
|
|
12
|
+
----------
|
|
13
|
+
div: :class:`~bs4.BeautifulSoup`
|
|
14
|
+
Soup of the div of one researcher
|
|
15
|
+
|
|
16
|
+
Returns
|
|
17
|
+
-------
|
|
18
|
+
:class:`tuple`
|
|
19
|
+
name, image url (or None), webpage (or None)
|
|
20
|
+
"""
|
|
21
|
+
img = div.img['src'] if div.img else None
|
|
22
|
+
url = div.a['href'] if div.a else None
|
|
23
|
+
name = div.h2.text.strip().title()
|
|
24
|
+
name = " ".join(name.split(" ", 1)[::-1])
|
|
25
|
+
return name, img, url
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Lamsade(LabMap):
|
|
29
|
+
"""
|
|
30
|
+
Class for handling the Lamsade team (Dauphine).
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
name = "Lamsade"
|
|
34
|
+
base_url = "https://www.lamsade.dauphine.fr/"
|
|
35
|
+
directory = "fr/personnes/enseignants-chercheurs-et-chercheurs.html"
|
|
36
|
+
|
|
37
|
+
def _author_iterator(self):
|
|
38
|
+
soup = Soup(get(self.base_url+self.directory), features="lxml")
|
|
39
|
+
for a in soup('div', class_="dauphinecv-item"):
|
|
40
|
+
name, img, url = lamsade_parse(a)
|
|
41
|
+
img = self.base_url+img if img else None
|
|
42
|
+
url = self.base_url+url if url else None
|
|
43
|
+
yield LabAuthor(name=name, metadata=AuthorMetadata(url=url, img=img, group=self.name))
|
gismap/sources/dblp.py
CHANGED
|
@@ -13,7 +13,7 @@ from gismap.utils.requests import get
|
|
|
13
13
|
class DBLP(DB):
|
|
14
14
|
db_name: ClassVar[str] = "dblp"
|
|
15
15
|
author_backoff: ClassVar[float] = 5.0
|
|
16
|
-
publi_backoff: ClassVar[float] =
|
|
16
|
+
publi_backoff: ClassVar[float] = 5.0
|
|
17
17
|
|
|
18
18
|
@classmethod
|
|
19
19
|
def search_author(cls, name, wait=True):
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import zlib
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from tqdm.auto import tqdm
|
|
7
|
+
|
|
8
|
+
from gismap.utils.requests import session
|
|
9
|
+
from gismap.sources.dblp import DBLP_TYPES
|
|
10
|
+
|
|
11
|
+
key_re = r'<https://dblp.org/rec/([^>]+)>'
|
|
12
|
+
title_re = r'.*?dblp:title\s+"([^"]+)"'
|
|
13
|
+
type_re = r'.*?dblp:bibtexType\s+bibtex:(\w+)'
|
|
14
|
+
authors_re = r'.*?dblp:hasSignature\s+(\[.*\])\s*;'
|
|
15
|
+
url_re = r'(?:.*?dblp:primaryDocumentPage <([^>]+)>)?'
|
|
16
|
+
stream_re = r'(?:.*?dblp:publishedInStream ([^;]+) ;)?'
|
|
17
|
+
pages_re = r'(?:.*?dblp:pagination "([^"]+)")?'
|
|
18
|
+
venue_re = r'(?:.*?dblp:publishedIn\s+"([^"]+?)")?'
|
|
19
|
+
year_re = r'.*?"(\d{4})"\^\^<http://www.w3.org/2001/XMLSchema#gYear>'
|
|
20
|
+
|
|
21
|
+
pub_re = re.compile("".join([key_re, title_re, type_re, authors_re,
|
|
22
|
+
url_re, stream_re, pages_re, venue_re, year_re]), flags=re.S)
|
|
23
|
+
|
|
24
|
+
streams_re = re.compile(r'<https://dblp.org/streams/((?:conf|journals)/[^>]+)>')
|
|
25
|
+
|
|
26
|
+
authid_re = re.compile(
|
|
27
|
+
r'\[.*?signatureDblpName\s*?"([^"]+?)(?:\s+\d+)?".*?signatureCreator\s*<https://dblp.org/pid/([^>]+?)>.*?]',
|
|
28
|
+
flags=re.S)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_block(dblp_block):
|
|
32
|
+
"""
|
|
33
|
+
Parameters
|
|
34
|
+
----------
|
|
35
|
+
dblp_block: :class:`str`
|
|
36
|
+
A DBLP publication, turtle format.
|
|
37
|
+
|
|
38
|
+
Returns
|
|
39
|
+
-------
|
|
40
|
+
key: :class:`str`
|
|
41
|
+
DBLP key.
|
|
42
|
+
title: :class:`str`
|
|
43
|
+
Publication title.
|
|
44
|
+
type: :class:`str`
|
|
45
|
+
Type of publication.
|
|
46
|
+
authors: :class:`dict`
|
|
47
|
+
Publication authors (key -> name)
|
|
48
|
+
url: :class:`str` or :class:`NoneType`
|
|
49
|
+
Publication URL.
|
|
50
|
+
stream: :class:`list` or :class:`NoneType`
|
|
51
|
+
Publication streams (normalized journal/conf).
|
|
52
|
+
pages: :class:`str` or :class:`NoneType`
|
|
53
|
+
Publication pages.
|
|
54
|
+
venue: :class:`str` or :class:`NoneType`
|
|
55
|
+
Publication venue (conf/journal).
|
|
56
|
+
year: :class:`int`
|
|
57
|
+
Year of publication.
|
|
58
|
+
"""
|
|
59
|
+
items = pub_re.search(dblp_block)
|
|
60
|
+
if items is None:
|
|
61
|
+
return None
|
|
62
|
+
key, title, typ, authors, url, stream, pages, venue, year = items.groups()
|
|
63
|
+
typ = typ.lower()
|
|
64
|
+
typ = DBLP_TYPES.get(typ, typ)
|
|
65
|
+
if stream:
|
|
66
|
+
stream = streams_re.findall(stream)
|
|
67
|
+
authors = {i: n for n, i in authid_re.findall(authors)}
|
|
68
|
+
if authors:
|
|
69
|
+
return key, title, typ, authors, url, stream, pages, venue, int(year)
|
|
70
|
+
return None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@contextmanager
|
|
74
|
+
def get_stream(source, chunk_size=1024 * 64):
|
|
75
|
+
"""
|
|
76
|
+
Parameters
|
|
77
|
+
----------
|
|
78
|
+
source: :class:`str` or :class:`~pathlib.Path`
|
|
79
|
+
Where the content. Can be on a local file or on the Internet.
|
|
80
|
+
chunk_size: :class:`int`, optional
|
|
81
|
+
Desired chunk size. For streaming gz content, must be a multiple of 32kB.
|
|
82
|
+
|
|
83
|
+
Yields
|
|
84
|
+
-------
|
|
85
|
+
iterable
|
|
86
|
+
Chunk iterator that streams the content.
|
|
87
|
+
:class:`int`
|
|
88
|
+
Source size (used later to compute ETA).
|
|
89
|
+
"""
|
|
90
|
+
if isinstance(source, str) and source.startswith("https://"):
|
|
91
|
+
# URL HTTP
|
|
92
|
+
with session.get(source, stream=True) as r:
|
|
93
|
+
r.raise_for_status()
|
|
94
|
+
total = int(r.headers.get("content-length", 0)) or None
|
|
95
|
+
yield r.iter_content(chunk_size=chunk_size), total
|
|
96
|
+
else:
|
|
97
|
+
source = Path(source)
|
|
98
|
+
if not source.exists():
|
|
99
|
+
yield [], 0
|
|
100
|
+
return None
|
|
101
|
+
total = source.stat().st_size
|
|
102
|
+
with source.open("rb") as file_handle:
|
|
103
|
+
def read_chunks():
|
|
104
|
+
while True:
|
|
105
|
+
chunk = file_handle.read(chunk_size)
|
|
106
|
+
if not chunk:
|
|
107
|
+
break
|
|
108
|
+
yield chunk
|
|
109
|
+
yield read_chunks(), total
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def publis_streamer(source, chunk_size=1024 * 64, encoding="unicode_escape"):
|
|
113
|
+
"""
|
|
114
|
+
Parameters
|
|
115
|
+
----------
|
|
116
|
+
source: :class:`str` or :class:`~pathlib.Path`
|
|
117
|
+
Where the DBLP turtle content is. Can be on a local file or on the Internet.
|
|
118
|
+
chunk_size: :class:`int`, optional
|
|
119
|
+
Desired chunk size. Must be a multiple of 32kB.
|
|
120
|
+
encoding: :class:`str`, default=unicode_escape
|
|
121
|
+
Encoding of stream.
|
|
122
|
+
|
|
123
|
+
Yields
|
|
124
|
+
-------
|
|
125
|
+
key: :class:`str`
|
|
126
|
+
DBLP key.
|
|
127
|
+
title: :class:`str`
|
|
128
|
+
Publication title.
|
|
129
|
+
type: :class:`str`
|
|
130
|
+
Type of publication.
|
|
131
|
+
authors: :class:`dict`
|
|
132
|
+
Publication authors (key -> name).
|
|
133
|
+
venue: :class:`str`
|
|
134
|
+
Publication venue (conf/journal).
|
|
135
|
+
year: :class:`int`
|
|
136
|
+
Year of publication.
|
|
137
|
+
"""
|
|
138
|
+
with get_stream(source, chunk_size=chunk_size) as (stream, total):
|
|
139
|
+
with tqdm(total=total, unit="B", unit_scale=True, unit_divisor=1024, desc="Processing") as pbar:
|
|
140
|
+
decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
|
|
141
|
+
text_buffer = ""
|
|
142
|
+
for chunk in stream:
|
|
143
|
+
if not chunk:
|
|
144
|
+
continue
|
|
145
|
+
|
|
146
|
+
pbar.update(len(chunk))
|
|
147
|
+
data = decomp.decompress(chunk)
|
|
148
|
+
if not data:
|
|
149
|
+
continue
|
|
150
|
+
text_buffer += data.decode(encoding, errors="replace")
|
|
151
|
+
|
|
152
|
+
blocks = text_buffer.split("\n\n")
|
|
153
|
+
text_buffer = blocks[-1]
|
|
154
|
+
for block in blocks[:-1]:
|
|
155
|
+
pub = parse_block(block)
|
|
156
|
+
if pub:
|
|
157
|
+
yield pub
|
|
158
|
+
|
|
159
|
+
data = decomp.flush()
|
|
160
|
+
if data:
|
|
161
|
+
text_buffer += data.decode(encoding, errors="replace")
|
|
162
|
+
|
|
163
|
+
if text_buffer:
|
|
164
|
+
blocks = text_buffer.split("\n\n")
|
|
165
|
+
for block in blocks:
|
|
166
|
+
pub = parse_block(block)
|
|
167
|
+
if pub:
|
|
168
|
+
yield pub
|
gismap/sources/hal.py
CHANGED
|
@@ -121,8 +121,8 @@ class HAL(DB):
|
|
|
121
121
|
>>> diego = publications[2].authors[3]
|
|
122
122
|
>>> diego
|
|
123
123
|
HALAuthor(name='Diego Perino', key='Diego Perino', key_type='fullname')
|
|
124
|
-
>>> len(diego.get_publications())
|
|
125
|
-
|
|
124
|
+
>>> len(diego.get_publications()) > 28
|
|
125
|
+
True
|
|
126
126
|
>>> publications[-7] # doctest: +NORMALIZE_WHITESPACE
|
|
127
127
|
HALPublication(title='Upper bounds for stabilization in acyclic preference-based systems',
|
|
128
128
|
authors=[HALAuthor(name='Fabien Mathieu', key='fabien-mathieu')],
|
gismap/sources/ldb.py
ADDED
|
@@ -0,0 +1,501 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
from typing import ClassVar
|
|
4
|
+
from platformdirs import user_data_dir
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
import errno
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
import zstandard as zstd
|
|
12
|
+
import dill as pickle
|
|
13
|
+
import numpy as np
|
|
14
|
+
import numba as nb
|
|
15
|
+
from bof.fuzz import Process
|
|
16
|
+
from gismo.common import safe_write
|
|
17
|
+
from tqdm.auto import tqdm
|
|
18
|
+
import requests
|
|
19
|
+
|
|
20
|
+
from gismap.sources.dblp_ttl import publis_streamer
|
|
21
|
+
from gismap.sources.models import DB, Author, Publication
|
|
22
|
+
from gismap.utils.logger import logger
|
|
23
|
+
from gismap.utils.text import asciify
|
|
24
|
+
from gismap.utils.zlist import ZList
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
DATA_DIR = Path(user_data_dir(
|
|
28
|
+
appname="gismap",
|
|
29
|
+
appauthor=False,
|
|
30
|
+
))
|
|
31
|
+
|
|
32
|
+
LDB_STEM = "ldb"
|
|
33
|
+
|
|
34
|
+
LDB_PATH = DATA_DIR / f"{LDB_STEM}.pkl.zst"
|
|
35
|
+
|
|
36
|
+
TTL_URL = "https://dblp.org/rdf/dblp.ttl.gz"
|
|
37
|
+
|
|
38
|
+
# GitHub release asset constants
|
|
39
|
+
GITHUB_REPO = "balouf/gismap"
|
|
40
|
+
GITHUB_API_URL = f"https://api.github.com/repos/{GITHUB_REPO}/releases"
|
|
41
|
+
LDB_ASSET_NAME = "ldb.pkl.zst"
|
|
42
|
+
LDB_META_PATH = DATA_DIR / "ldb_meta.json"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(repr=False)
|
|
46
|
+
class LDB(DB):
|
|
47
|
+
"""
|
|
48
|
+
Browse DBLP from a local copy of the database.
|
|
49
|
+
|
|
50
|
+
LDB is a class-only database - it should not be instantiated.
|
|
51
|
+
All methods are classmethods and state is stored in class variables.
|
|
52
|
+
"""
|
|
53
|
+
db_name: ClassVar[str] = LDB_STEM
|
|
54
|
+
source: ClassVar[str] = TTL_URL
|
|
55
|
+
|
|
56
|
+
# Class-level state (replaces instance attributes)
|
|
57
|
+
authors: ClassVar[ZList | None] = None
|
|
58
|
+
publis: ClassVar[ZList | None] = None
|
|
59
|
+
keys: ClassVar[dict | None] = None
|
|
60
|
+
search_engine: ClassVar[Process | None] = None
|
|
61
|
+
_initialized: ClassVar[bool] = False
|
|
62
|
+
|
|
63
|
+
__hash__ = object.__hash__
|
|
64
|
+
|
|
65
|
+
def __init__(self):
|
|
66
|
+
raise TypeError(
|
|
67
|
+
"LDB should not be instantiated. Use class methods directly, e.g., LDB.search_author(name)"
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
@classmethod
|
|
71
|
+
def _ensure_loaded(cls):
|
|
72
|
+
"""Lazy-load the database if not already loaded."""
|
|
73
|
+
if cls._initialized:
|
|
74
|
+
return
|
|
75
|
+
if LDB_PATH.exists():
|
|
76
|
+
cls.load_db()
|
|
77
|
+
else:
|
|
78
|
+
logger.info("LDB not found locally. Attempting to retrieve from GitHub...")
|
|
79
|
+
try:
|
|
80
|
+
cls.retrieve()
|
|
81
|
+
cls.load_db()
|
|
82
|
+
except RuntimeError as e:
|
|
83
|
+
logger.warning(f"Could not auto-retrieve LDB: {e}")
|
|
84
|
+
|
|
85
|
+
@classmethod
|
|
86
|
+
def build_db(cls, source=None, limit=None, n_range=2, length_impact=.1, authors_frame=512, publis_frame=256):
|
|
87
|
+
if source is None:
|
|
88
|
+
source = cls.source
|
|
89
|
+
authors_dict = dict()
|
|
90
|
+
logger.info("Retrieve publications")
|
|
91
|
+
with ZList(frame_size=publis_frame) as publis:
|
|
92
|
+
for i, (key, title, typ, authors, url, streams, pages, venue, year) in enumerate(publis_streamer(source)):
|
|
93
|
+
auth_indices = []
|
|
94
|
+
for auth_key, auth_name in authors.items():
|
|
95
|
+
if auth_key not in authors_dict:
|
|
96
|
+
authors_dict[auth_key] = (len(authors_dict), auth_name, [i])
|
|
97
|
+
else:
|
|
98
|
+
authors_dict[auth_key][2].append(i)
|
|
99
|
+
auth_indices.append(authors_dict[auth_key][0])
|
|
100
|
+
publis.append((key, title, typ, auth_indices, url, streams, pages, venue, year))
|
|
101
|
+
if i == limit:
|
|
102
|
+
break
|
|
103
|
+
cls.publis = publis
|
|
104
|
+
logger.info(f"{len(publis)} publications retrieved.")
|
|
105
|
+
logger.info("Compact authors")
|
|
106
|
+
with ZList(frame_size=authors_frame) as authors:
|
|
107
|
+
for key, (_, name, pubs) in tqdm(authors_dict.items()):
|
|
108
|
+
authors.append((key, name, pubs))
|
|
109
|
+
cls.authors = authors
|
|
110
|
+
cls.keys = {k: v[0] for k, v in authors_dict.items()}
|
|
111
|
+
del authors_dict
|
|
112
|
+
cls.search_engine = Process(n_range=n_range, length_impact=length_impact)
|
|
113
|
+
cls.search_engine.fit([asciify(a[1]) for a in authors])
|
|
114
|
+
cls.search_engine.choices = np.arange(len(authors))
|
|
115
|
+
cls.search_engine.vectorizer.features_ = cls.numbify_dict(cls.search_engine.vectorizer.features_)
|
|
116
|
+
logger.info(f"{len(cls.authors)} compacted.")
|
|
117
|
+
cls._invalidate_cache()
|
|
118
|
+
cls._initialized = True
|
|
119
|
+
|
|
120
|
+
@classmethod
|
|
121
|
+
@lru_cache(maxsize=50000)
|
|
122
|
+
def author_by_index(cls, i):
|
|
123
|
+
key, name, _ = cls.authors[i]
|
|
124
|
+
return LDBAuthor(key=key, name=name)
|
|
125
|
+
|
|
126
|
+
@classmethod
|
|
127
|
+
def author_by_key(cls, key):
|
|
128
|
+
return cls.author_by_index(cls.keys[key])
|
|
129
|
+
|
|
130
|
+
@classmethod
|
|
131
|
+
@lru_cache(maxsize=50000)
|
|
132
|
+
def publication_by_index(cls, i):
|
|
133
|
+
key, title, typ, authors, url, streams, pages, venue, year = cls.publis[i]
|
|
134
|
+
if venue is None:
|
|
135
|
+
venue = "unpublished"
|
|
136
|
+
return {"key": key, "title": title, "type": typ,
|
|
137
|
+
"authors": authors,
|
|
138
|
+
"url": url, "streams": streams, "pages": pages,
|
|
139
|
+
"venue": venue, "year": year}
|
|
140
|
+
|
|
141
|
+
@classmethod
|
|
142
|
+
def author_publications(cls, key):
|
|
143
|
+
cls._ensure_loaded()
|
|
144
|
+
_, name, pubs = cls.authors[cls.keys[key]]
|
|
145
|
+
pubs = [cls.publication_by_index(k).copy() for k in pubs]
|
|
146
|
+
auth_ids = sorted({k for p in pubs for k in p["authors"]})
|
|
147
|
+
auths = {k: cls.author_by_index(k) for k in auth_ids}
|
|
148
|
+
for pub in pubs:
|
|
149
|
+
pub["authors"] = [auths[k] for k in pub["authors"]]
|
|
150
|
+
metadata = dict()
|
|
151
|
+
for k in ["url", "streams", "pages"]:
|
|
152
|
+
v = pub.pop(k)
|
|
153
|
+
if v is not None:
|
|
154
|
+
metadata[k] = v
|
|
155
|
+
pub["metadata"] = metadata
|
|
156
|
+
return [LDBPublication(**pub) for pub in pubs]
|
|
157
|
+
|
|
158
|
+
@classmethod
|
|
159
|
+
@lru_cache(maxsize=1000)
|
|
160
|
+
def search_author(cls, name, limit=2, score_cutoff=40.0, slack=10.0):
|
|
161
|
+
cls._ensure_loaded()
|
|
162
|
+
res = cls.search_engine.extract(asciify(name), limit=limit, score_cutoff=score_cutoff)
|
|
163
|
+
res = [r[0] for r in res if r[1] > res[0][1] - slack]
|
|
164
|
+
sorted_ids = {i: cls.author_by_index(i) for i in sorted(res)}
|
|
165
|
+
return [sorted_ids[i] for i in res]
|
|
166
|
+
|
|
167
|
+
@classmethod
|
|
168
|
+
def _invalidate_cache(cls):
|
|
169
|
+
cls.search_author.cache_clear()
|
|
170
|
+
cls.publication_by_index.cache_clear()
|
|
171
|
+
cls.author_by_index.cache_clear()
|
|
172
|
+
|
|
173
|
+
@classmethod
|
|
174
|
+
def from_author(cls, a):
|
|
175
|
+
return cls.author_publications(a.key)
|
|
176
|
+
|
|
177
|
+
@classmethod
|
|
178
|
+
def _get_release_info(cls, tag: str | None = None) -> dict:
|
|
179
|
+
"""
|
|
180
|
+
Fetch release metadata from GitHub API.
|
|
181
|
+
|
|
182
|
+
Parameters
|
|
183
|
+
----------
|
|
184
|
+
tag: :class:`str`, optional
|
|
185
|
+
Specific release tag (e.g., "v0.4.0"). If None, fetches latest.
|
|
186
|
+
|
|
187
|
+
Returns
|
|
188
|
+
-------
|
|
189
|
+
:class:`dict`
|
|
190
|
+
Release metadata including tag_name and assets.
|
|
191
|
+
|
|
192
|
+
Raises
|
|
193
|
+
------
|
|
194
|
+
:class:`RuntimeError`
|
|
195
|
+
If release not found or API request fails.
|
|
196
|
+
"""
|
|
197
|
+
if tag is None:
|
|
198
|
+
url = f"{GITHUB_API_URL}/latest"
|
|
199
|
+
else:
|
|
200
|
+
url = f"{GITHUB_API_URL}/tags/{tag}"
|
|
201
|
+
|
|
202
|
+
try:
|
|
203
|
+
response = requests.get(url, timeout=30)
|
|
204
|
+
response.raise_for_status()
|
|
205
|
+
return response.json()
|
|
206
|
+
except requests.exceptions.HTTPError as e:
|
|
207
|
+
if response.status_code == 404:
|
|
208
|
+
raise RuntimeError(f"Release not found: {tag or 'latest'}") from e
|
|
209
|
+
raise RuntimeError(f"GitHub API error: {e}") from e
|
|
210
|
+
except requests.exceptions.RequestException as e:
|
|
211
|
+
raise RuntimeError(f"Network error fetching release info: {e}") from e
|
|
212
|
+
|
|
213
|
+
@classmethod
|
|
214
|
+
def _download_file(cls, url: str, dest: Path, desc: str = "Downloading"):
|
|
215
|
+
"""
|
|
216
|
+
Download file with progress bar.
|
|
217
|
+
|
|
218
|
+
Parameters
|
|
219
|
+
----------
|
|
220
|
+
url : str
|
|
221
|
+
URL to download from.
|
|
222
|
+
dest : Path
|
|
223
|
+
Destination file path.
|
|
224
|
+
desc : str
|
|
225
|
+
Description for progress bar.
|
|
226
|
+
"""
|
|
227
|
+
dest.parent.mkdir(parents=True, exist_ok=True)
|
|
228
|
+
|
|
229
|
+
response = requests.get(url, stream=True, timeout=30)
|
|
230
|
+
response.raise_for_status()
|
|
231
|
+
|
|
232
|
+
total_size = int(response.headers.get('content-length', 0))
|
|
233
|
+
|
|
234
|
+
with open(dest, 'wb') as f, tqdm(
|
|
235
|
+
desc=desc,
|
|
236
|
+
total=total_size,
|
|
237
|
+
unit='B',
|
|
238
|
+
unit_scale=True,
|
|
239
|
+
unit_divisor=1024,
|
|
240
|
+
) as pbar:
|
|
241
|
+
for chunk in response.iter_content(chunk_size=8192):
|
|
242
|
+
if chunk:
|
|
243
|
+
f.write(chunk)
|
|
244
|
+
pbar.update(len(chunk))
|
|
245
|
+
|
|
246
|
+
@classmethod
|
|
247
|
+
def _save_meta(cls, tag: str, url: str, size: int):
|
|
248
|
+
"""Save version metadata to JSON file."""
|
|
249
|
+
meta = {
|
|
250
|
+
"tag": tag,
|
|
251
|
+
"url": url,
|
|
252
|
+
"size": size,
|
|
253
|
+
"downloaded_at": datetime.now(timezone.utc).isoformat(),
|
|
254
|
+
}
|
|
255
|
+
LDB_META_PATH.parent.mkdir(parents=True, exist_ok=True)
|
|
256
|
+
with open(LDB_META_PATH, 'w') as f:
|
|
257
|
+
json.dump(meta, f, indent=2)
|
|
258
|
+
|
|
259
|
+
@classmethod
|
|
260
|
+
def _load_meta(cls) -> dict | None:
|
|
261
|
+
"""Load version metadata from JSON file."""
|
|
262
|
+
if not LDB_META_PATH.exists():
|
|
263
|
+
return None
|
|
264
|
+
try:
|
|
265
|
+
with open(LDB_META_PATH, 'r') as f:
|
|
266
|
+
return json.load(f)
|
|
267
|
+
except (json.JSONDecodeError, IOError):
|
|
268
|
+
return None
|
|
269
|
+
|
|
270
|
+
@classmethod
|
|
271
|
+
def retrieve(cls, version: str | None = None, force: bool = False):
|
|
272
|
+
"""
|
|
273
|
+
Download LDB database from GitHub releases.
|
|
274
|
+
|
|
275
|
+
Parameters
|
|
276
|
+
----------
|
|
277
|
+
version: :class:`str`, optional
|
|
278
|
+
Specific release version (e.g., "v0.4.0" or "0.4.0").
|
|
279
|
+
If None, downloads from latest release.
|
|
280
|
+
force: :class:`bool`, default=False
|
|
281
|
+
Download even if same version is installed.
|
|
282
|
+
|
|
283
|
+
Examples
|
|
284
|
+
--------
|
|
285
|
+
>> LDB.retrieve() # Latest release (freshest data)
|
|
286
|
+
>> LDB.retrieve("v0.4.0") # Specific version
|
|
287
|
+
>> LDB.retrieve("0.4.0") # Also works without 'v' prefix
|
|
288
|
+
|
|
289
|
+
Raises
|
|
290
|
+
------
|
|
291
|
+
RuntimeError
|
|
292
|
+
If release or asset not found, or download fails.
|
|
293
|
+
"""
|
|
294
|
+
# Normalize version string (add "v" prefix if missing)
|
|
295
|
+
tag = None
|
|
296
|
+
if version is not None:
|
|
297
|
+
tag = version if version.startswith("v") else f"v{version}"
|
|
298
|
+
|
|
299
|
+
# Fetch release info
|
|
300
|
+
logger.info(f"Fetching release info for: {tag or 'latest'}")
|
|
301
|
+
release_info = cls._get_release_info(tag)
|
|
302
|
+
release_tag = release_info["tag_name"]
|
|
303
|
+
|
|
304
|
+
# Check if already installed (unless force=True)
|
|
305
|
+
if not force:
|
|
306
|
+
meta = cls._load_meta()
|
|
307
|
+
if meta and meta.get("tag") == release_tag and LDB_PATH.exists():
|
|
308
|
+
logger.info(f"LDB version {release_tag} already installed. Use force=True to re-download.")
|
|
309
|
+
return
|
|
310
|
+
|
|
311
|
+
# Find ldb.pkl.zst asset in release
|
|
312
|
+
assets = release_info.get("assets", [])
|
|
313
|
+
ldb_asset = None
|
|
314
|
+
for asset in assets:
|
|
315
|
+
if asset["name"] == LDB_ASSET_NAME:
|
|
316
|
+
ldb_asset = asset
|
|
317
|
+
break
|
|
318
|
+
|
|
319
|
+
if ldb_asset is None:
|
|
320
|
+
raise RuntimeError(
|
|
321
|
+
f"Asset '{LDB_ASSET_NAME}' not found in release {release_tag}. "
|
|
322
|
+
f"Available assets: {[a['name'] for a in assets]}"
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
download_url = ldb_asset["browser_download_url"]
|
|
326
|
+
asset_size = ldb_asset["size"]
|
|
327
|
+
|
|
328
|
+
logger.info(f"Downloading LDB from release {release_tag} ({asset_size / 1e9:.2f} GB)")
|
|
329
|
+
|
|
330
|
+
# Download with progress bar
|
|
331
|
+
cls._download_file(download_url, LDB_PATH, desc=f"LDB {release_tag}")
|
|
332
|
+
|
|
333
|
+
# Save version metadata
|
|
334
|
+
cls._save_meta(release_tag, download_url, asset_size)
|
|
335
|
+
|
|
336
|
+
# Reset initialized flag so next access reloads
|
|
337
|
+
cls._initialized = False
|
|
338
|
+
cls._invalidate_cache()
|
|
339
|
+
|
|
340
|
+
logger.info(f"LDB {release_tag} successfully installed to {LDB_PATH}")
|
|
341
|
+
|
|
342
|
+
@classmethod
|
|
343
|
+
def db_info(cls) -> dict | None:
|
|
344
|
+
"""
|
|
345
|
+
Return installed version info.
|
|
346
|
+
|
|
347
|
+
Returns
|
|
348
|
+
-------
|
|
349
|
+
:class:`dict` or :class:`None`
|
|
350
|
+
Dictionary with tag, date, size, path; or None if not installed.
|
|
351
|
+
"""
|
|
352
|
+
meta = cls._load_meta()
|
|
353
|
+
if meta is None or not LDB_PATH.exists():
|
|
354
|
+
return None
|
|
355
|
+
|
|
356
|
+
return {
|
|
357
|
+
"tag": meta.get("tag"),
|
|
358
|
+
"downloaded_at": meta.get("downloaded_at"),
|
|
359
|
+
"size": meta.get("size"),
|
|
360
|
+
"path": str(LDB_PATH),
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
@classmethod
|
|
364
|
+
def check_update(cls) -> dict | None:
|
|
365
|
+
"""
|
|
366
|
+
Check if a newer version is available on GitHub.
|
|
367
|
+
|
|
368
|
+
Returns
|
|
369
|
+
-------
|
|
370
|
+
:class:`dict` or None
|
|
371
|
+
Dictionary with update info if available, None if up to date.
|
|
372
|
+
"""
|
|
373
|
+
try:
|
|
374
|
+
release_info = cls._get_release_info()
|
|
375
|
+
latest_tag = release_info["tag_name"]
|
|
376
|
+
|
|
377
|
+
meta = cls._load_meta()
|
|
378
|
+
current_tag = meta.get("tag") if meta else None
|
|
379
|
+
|
|
380
|
+
if current_tag == latest_tag:
|
|
381
|
+
logger.info(f"LDB is up to date: {current_tag}")
|
|
382
|
+
return None
|
|
383
|
+
|
|
384
|
+
return {
|
|
385
|
+
"current": current_tag,
|
|
386
|
+
"latest": latest_tag,
|
|
387
|
+
"message": f"Update available: {current_tag or 'not installed'} -> {latest_tag}",
|
|
388
|
+
}
|
|
389
|
+
except RuntimeError as e:
|
|
390
|
+
logger.warning(f"Could not check for updates: {e}")
|
|
391
|
+
return None
|
|
392
|
+
|
|
393
|
+
@classmethod
|
|
394
|
+
def dump(cls, filename: str, path=".", overwrite=False):
|
|
395
|
+
"""Save class state to file."""
|
|
396
|
+
# Convert numba dict to regular dict for pickling
|
|
397
|
+
nb_dict = None
|
|
398
|
+
if cls.search_engine is not None:
|
|
399
|
+
nb_dict = cls.search_engine.vectorizer.features_
|
|
400
|
+
cls.search_engine.vectorizer.features_ = dict(nb_dict)
|
|
401
|
+
|
|
402
|
+
state = {
|
|
403
|
+
'authors': cls.authors,
|
|
404
|
+
'publis': cls.publis,
|
|
405
|
+
'keys': cls.keys,
|
|
406
|
+
'search_engine': cls.search_engine,
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
# Use safe_write pattern from gismo.common
|
|
410
|
+
destination = Path(path) / f"{Path(filename).stem}.pkl.zst"
|
|
411
|
+
if destination.exists() and not overwrite:
|
|
412
|
+
print(f"File {destination} already exists! Use overwrite option to overwrite.")
|
|
413
|
+
else:
|
|
414
|
+
with safe_write(destination) as f:
|
|
415
|
+
cctx = zstd.ZstdCompressor(level=3)
|
|
416
|
+
with cctx.stream_writer(f) as z:
|
|
417
|
+
pickle.dump(state, z, protocol=5)
|
|
418
|
+
|
|
419
|
+
# Restore numba dict
|
|
420
|
+
if cls.search_engine is not None:
|
|
421
|
+
cls.search_engine.vectorizer.features_ = nb_dict
|
|
422
|
+
|
|
423
|
+
@classmethod
|
|
424
|
+
def load(cls, filename: str, path="."):
|
|
425
|
+
"""Load class state from file."""
|
|
426
|
+
dest = Path(path) / f"{Path(filename).stem}.pkl.zst"
|
|
427
|
+
if not dest.exists():
|
|
428
|
+
dest = dest.with_suffix(".pkl")
|
|
429
|
+
if not dest.exists():
|
|
430
|
+
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), dest)
|
|
431
|
+
|
|
432
|
+
dctx = zstd.ZstdDecompressor()
|
|
433
|
+
with open(dest, "rb") as f, dctx.stream_reader(f) as z:
|
|
434
|
+
state = pickle.load(z)
|
|
435
|
+
|
|
436
|
+
cls.authors = state['authors']
|
|
437
|
+
cls.publis = state['publis']
|
|
438
|
+
cls.keys = state['keys']
|
|
439
|
+
cls.search_engine = state['search_engine']
|
|
440
|
+
|
|
441
|
+
if cls.search_engine is not None:
|
|
442
|
+
cls.search_engine.vectorizer.features_ = cls.numbify_dict(
|
|
443
|
+
cls.search_engine.vectorizer.features_
|
|
444
|
+
)
|
|
445
|
+
|
|
446
|
+
cls._invalidate_cache()
|
|
447
|
+
cls._initialized = True
|
|
448
|
+
|
|
449
|
+
@classmethod
def dump_db(cls):
    """Persist the current LDB state to the standard data directory."""
    # Make sure the target directory exists before writing the dump.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    cls.dump(LDB_STEM, path=DATA_DIR, overwrite=True)
|
|
453
|
+
|
|
454
|
+
@classmethod
def load_db(cls):
    """Restore the LDB state from the standard data directory, if any."""
    try:
        cls.load(LDB_STEM, path=DATA_DIR)
    except FileNotFoundError:
        # Best effort: a missing database is not fatal, just warn the user.
        logger.warning("No LDB installed. Build or retrieve before using.")
|
|
460
|
+
|
|
461
|
+
@staticmethod
def delete_db():
    """Remove the on-disk LDB dump if present (no-op otherwise)."""
    LDB_PATH.unlink(missing_ok=True)
|
|
465
|
+
|
|
466
|
+
@staticmethod
def numbify_dict(input_dict):
    """
    Convert a plain ``str -> int`` mapping into a numba typed dict.

    Parameters
    ----------
    input_dict: :class:`dict`
        Mapping with string keys and integer values.

    Returns
    -------
    :class:`numba.typed.Dict`
        Typed copy usable from jitted code.
    """
    typed = nb.typed.Dict.empty(
        key_type=nb.types.unicode_type, value_type=nb.types.int64
    )
    # Numba typed dicts cannot be built from a comprehension; copy key by key.
    for key, value in input_dict.items():
        typed[key] = value
    return typed
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
@dataclass(repr=False)
class LDBAuthor(Author, LDB):
    """
    Author backed by the local DBLP database (LDB).

    Attributes
    ----------
    key: :class:`str`
        DBLP persistent identifier (pid) of the author.
    aliases: :class:`list`
        Alternative names, if any.
    """
    key: str
    aliases: list = field(default_factory=list)

    @property
    def url(self):
        """Author page on dblp.org."""
        return f"https://dblp.org/pid/{self.key}.html"

    def get_publications(self):
        """Return the author's publications, looked up in the local database."""
        return LDB.from_author(self)
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
@dataclass(repr=False)
class LDBPublication(Publication, LDB):
    """
    Publication backed by the local DBLP database (LDB).

    Attributes
    ----------
    key: :class:`str`
        DBLP record key of the publication.
    metadata: :class:`dict`
        Extra record fields (e.g. ``url``, ``streams``).
    """
    key: str
    metadata: dict = field(default_factory=dict)

    @property
    def url(self):
        """Publication URL; falls back to the dblp.org record page."""
        return self.metadata.get("url", f"https://dblp.org/rec/{self.key}.html")

    @property
    def stream(self):
        """URL of the first associated dblp stream, or None if absent."""
        if "streams" not in self.metadata:
            return None
        return f'https://dblp.org/streams/{self.metadata["streams"][0]}'
|
gismap/sources/multi.py
CHANGED
|
@@ -15,7 +15,7 @@ def score_author_source(dbauthor):
|
|
|
15
15
|
return 2
|
|
16
16
|
else:
|
|
17
17
|
return 3
|
|
18
|
-
elif dbauthor.db_name
|
|
18
|
+
elif dbauthor.db_name in ["dblp", "ldb"]:
|
|
19
19
|
return 1
|
|
20
20
|
else:
|
|
21
21
|
return 0
|
|
@@ -69,7 +69,7 @@ class SourcedAuthor(Author):
|
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
publication_score_rosetta = {
|
|
72
|
-
"db_name": {"dblp": 1, "hal": 2},
|
|
72
|
+
"db_name": {"dblp": 1, "ldb": 1, "hal": 2},
|
|
73
73
|
"venue": {"CoRR": -1, "unpublished": -2},
|
|
74
74
|
"type": {"conference": 1, "journal": 2},
|
|
75
75
|
}
|
gismap/utils/common.py
CHANGED
|
@@ -30,7 +30,7 @@ def unlist(x):
|
|
|
30
30
|
return x[0] if (isinstance(x, list) and x) else x
|
|
31
31
|
|
|
32
32
|
|
|
33
|
-
def get_classes(root, key="name"):
|
|
33
|
+
def get_classes(root, key="name", recurse=False):
|
|
34
34
|
"""
|
|
35
35
|
Parameters
|
|
36
36
|
----------
|
|
@@ -38,6 +38,8 @@ def get_classes(root, key="name"):
|
|
|
38
38
|
Starting class (can be abstract).
|
|
39
39
|
key: :class:`str`, default='name'
|
|
40
40
|
Attribute to look-up
|
|
41
|
+
recurse: bool, default=False
|
|
42
|
+
Recursively traverse subclasses.
|
|
41
43
|
|
|
42
44
|
Returns
|
|
43
45
|
-------
|
|
@@ -50,13 +52,16 @@ def get_classes(root, key="name"):
|
|
|
50
52
|
>>> from gismap.sources.models import DB
|
|
51
53
|
>>> subclasses = get_classes(DB, key='db_name')
|
|
52
54
|
>>> dict(sorted(subclasses.items())) # doctest: +NORMALIZE_WHITESPACE
|
|
53
|
-
{'dblp': <class 'gismap.sources.dblp.DBLP'>,
|
|
55
|
+
{'dblp': <class 'gismap.sources.dblp.DBLP'>,
|
|
56
|
+
'hal': <class 'gismap.sources.hal.HAL'>,
|
|
57
|
+
'ldb': <class 'gismap.sources.ldb.LDB'>}
|
|
54
58
|
"""
|
|
55
59
|
result = {
|
|
56
60
|
getattr(c, key): c for c in root.__subclasses__() if getattr(c, key, None)
|
|
57
61
|
}
|
|
58
|
-
|
|
59
|
-
|
|
62
|
+
if recurse:
|
|
63
|
+
for c in root.__subclasses__():
|
|
64
|
+
result.update(get_classes(c, key=key, recurse=True))
|
|
60
65
|
return result
|
|
61
66
|
|
|
62
67
|
|
|
@@ -83,20 +88,20 @@ def list_of_objects(clss, dico, default=None):
|
|
|
83
88
|
|
|
84
89
|
>>> from gismap.sources.models import DB
|
|
85
90
|
>>> subclasses = get_classes(DB, key='db_name')
|
|
86
|
-
>>> from gismap import HAL, DBLP
|
|
87
|
-
>>> list_of_objects([HAL, '
|
|
88
|
-
[<class 'gismap.sources.hal.HAL'>, <class 'gismap.sources.
|
|
91
|
+
>>> from gismap import HAL, DBLP, LDB
|
|
92
|
+
>>> list_of_objects([HAL, 'ldb'], subclasses)
|
|
93
|
+
[<class 'gismap.sources.hal.HAL'>, <class 'gismap.sources.ldb.LDB'>]
|
|
89
94
|
>>> list_of_objects(None, subclasses, [DBLP])
|
|
90
95
|
[<class 'gismap.sources.dblp.DBLP'>]
|
|
91
|
-
>>> list_of_objects(
|
|
92
|
-
[<class 'gismap.sources.
|
|
96
|
+
>>> list_of_objects(LDB, subclasses)
|
|
97
|
+
[<class 'gismap.sources.ldb.LDB'>]
|
|
93
98
|
>>> list_of_objects('hal', subclasses)
|
|
94
99
|
[<class 'gismap.sources.hal.HAL'>]
|
|
95
100
|
"""
|
|
96
101
|
if default is None:
|
|
97
102
|
default = []
|
|
98
103
|
if clss is None:
|
|
99
|
-
return default
|
|
104
|
+
return list_of_objects(clss=default, dico=dico)
|
|
100
105
|
elif isinstance(clss, str):
|
|
101
106
|
return [dico[clss]]
|
|
102
107
|
elif isinstance(clss, list):
|
gismap/utils/logger.py
CHANGED
gismap/utils/requests.py
CHANGED
|
@@ -21,7 +21,9 @@ def get(url, params=None, n_trials=10, verify=True):
|
|
|
21
21
|
Entry point to fetch.
|
|
22
22
|
params: :class:`dict`, optional
|
|
23
23
|
Get arguments (appended to URL).
|
|
24
|
-
|
|
24
|
+
n_trials: :class:`int`, default=10
|
|
25
|
+
Number of attempts to fetch URL.
|
|
26
|
+
verify: :class:`bool`, default=True
|
|
25
27
|
Verify certificates.
|
|
26
28
|
|
|
27
29
|
Returns
|
gismap/utils/zlist.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from gismo.common import MixInIO
|
|
2
|
+
import zstandard as zstd
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pickle
|
|
5
|
+
|
|
6
|
+
dctx = zstd.ZstdDecompressor()
cctx = zstd.ZstdCompressor()


class ZList(MixInIO):
    """
    List compressed by frames of elements. Allows to store compressed data in
    memory with decent seek and scan.

    Populate inside a ``with`` block (building mode), then read by index.
    Supporting ``__getitem__`` + ``__len__`` makes the object iterable.

    Parameters
    ----------
    frame_size: :class:`int`, default=1000
        Size of each frame in number of elements.
    """
    def __init__(self, frame_size=1000):
        self.frame_size = frame_size
        self.frame = None  # currently decompressed frame (list of elements)
        self._frame_index = None  # index of the frame held in `frame`
        self._blob = None  # concatenated compressed frames
        self._off = None  # byte offsets of frame boundaries within `_blob`
        self._n = None  # total number of elements appended
        self._batch = None  # pending, not-yet-compressed elements

    def _merge_batch(self):
        """Compress pending elements into one frame and append it to the blob."""
        if self._batch:
            frame = cctx.compress(pickle.dumps(self._batch))
            self._blob += frame
            self._off.append(len(self._blob))
            self._batch = []

    def append(self, entry):
        """Add one element. Only valid inside the ``with`` (building) block."""
        self._batch.append(entry)
        self._n += 1
        if len(self._batch) == self.frame_size:
            self._merge_batch()

    @property
    def size(self):
        """Compressed size of the blob, in bytes."""
        return len(self._blob)

    def __enter__(self):
        # Enter building mode: reset all internal structures.
        self._blob = bytearray()
        self._off = [0]
        self._n = 0
        self._batch = []
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Flush the last (possibly partial) frame and freeze the structures.
        self._merge_batch()
        self._blob = bytes(self._blob)
        self._off = np.array(self._off, dtype=int)

    def load_frame(self, f):
        """Decompress frame number `f` into ``self.frame``."""
        self.frame = pickle.loads(
            dctx.decompress(self._blob[self._off[f]:self._off[f + 1]])
        )

    def __getitem__(self, i):
        # Normalize negative indices and bound-check explicitly: the previous
        # arithmetic sliced `_blob[_off[-1]:_off[0]]` (empty) for i < 0 and
        # failed with incidental errors past the end. A proper IndexError also
        # lets plain iteration (`for x in zlist`) terminate correctly.
        if i < 0:
            i += self._n
        if not 0 <= i < self._n:
            raise IndexError("ZList index out of range")
        g, f = divmod(i, self.frame_size)
        if g != self._frame_index:
            # Cache the decompressed frame; sequential scans reuse it.
            self.load_frame(g)
            self._frame_index = g
        return self.frame[f]

    def __len__(self):
        return self._n
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gismap
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: GisMap leverages DBLP and HAL databases to provide cartography tools for you and your lab.
|
|
5
5
|
Project-URL: Repository, https://github.com/balouf/gismap
|
|
6
6
|
Project-URL: Documentation, https://balouf.github.io/gismap
|
|
@@ -9,12 +9,14 @@ Maintainer-email: Fabien Mathieu <fabien.mathieu@normalesup.org>
|
|
|
9
9
|
License-Expression: MIT
|
|
10
10
|
License-File: AUTHORS.md
|
|
11
11
|
Requires-Python: >=3.10
|
|
12
|
+
Requires-Dist: beautifulsoup4>=4.14.2
|
|
12
13
|
Requires-Dist: bof>=0.3.5
|
|
13
14
|
Requires-Dist: distinctipy>=1.3.4
|
|
14
15
|
Requires-Dist: domonic>=0.9.13
|
|
15
16
|
Requires-Dist: gismo>=0.5.2
|
|
16
17
|
Requires-Dist: ipykernel>=6.30.1
|
|
17
18
|
Requires-Dist: ipywidgets>=8.1.8
|
|
19
|
+
Requires-Dist: platformdirs>=4.5.0
|
|
18
20
|
Requires-Dist: tqdm>=4.67.1
|
|
19
21
|
Description-Content-Type: text/markdown
|
|
20
22
|
|
|
@@ -61,17 +63,28 @@ Install GisMap:
|
|
|
61
63
|
$ pip install gismap
|
|
62
64
|
```
|
|
63
65
|
|
|
64
|
-
Use GisMap to
|
|
66
|
+
Use GisMap to display a collaboration graph (HTML) from a Notebook:
|
|
65
67
|
|
|
66
68
|
```pycon
|
|
67
|
-
>>> from gismap.
|
|
68
|
-
>>>
|
|
69
|
-
>>> lab = ListLab(["Fabien Mathieu", "François Baccelli", "Ludovic Noirie", "Céline Comte", "Sébastien Tixeuil"], dbs="hal")
|
|
69
|
+
>>> from gismap.lab import ListMap
|
|
70
|
+
>>> lab = ListMap(["Fabien Mathieu", "François Baccelli", "Ludovic Noirie", "Céline Comte", "Sébastien Tixeuil"], dbs="hal")
|
|
70
71
|
>>> lab.update_authors()
|
|
71
72
|
>>> lab.update_publis()
|
|
72
73
|
>>> lab.show_html()
|
|
73
74
|
```
|
|
74
75
|
|
|
76
|
+
If you are not using Jupyter Lab/Notebook, rich display will not work.
|
|
77
|
+
Instead, save the HTML and open it in your browser:
|
|
78
|
+
|
|
79
|
+
```pycon
|
|
80
|
+
>>> from gismap.lab import ListMap
|
|
81
|
+
>>> lab = ListMap(["Fabien Mathieu", "François Baccelli", "Ludovic Noirie", "Céline Comte", "Sébastien Tixeuil"], dbs="hal")
|
|
82
|
+
>>> lab.update_authors()
|
|
83
|
+
>>> lab.update_publis()
|
|
84
|
+
>>> lab.save_html("my_graph")
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
|
|
75
88
|
## Credits
|
|
76
89
|
|
|
77
90
|
This package was created with [Cookiecutter][CC] and the [Package Helper 3][PH3] project template.
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
gismap/__init__.py,sha256=
|
|
1
|
+
gismap/__init__.py,sha256=FHZLy3T2zFVrFRe3sqSRTuXkZ6DYise9HtXCWWfXrys,866
|
|
2
2
|
gismap/author.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
gismap/build.py,sha256=1oNs3qjm2DNkOP19iVDPderF2Sx3w5qJH4SgirDHKcU,103
|
|
3
4
|
gismap/gismap.py,sha256=h0hwdogXGFqerm-5ZPeT-irPn91pCcQRjiHThXsRzEk,19
|
|
4
5
|
gismap/gismo.py,sha256=oDAryl4XQzHE0tUmOWC-3G1n_zUgTeykPL-JWSDYwe0,6307
|
|
5
6
|
gismap/search.py,sha256=nsUoDsFGeEtvCZ0dB7ooRPC_6qsazkiWx_oM7dHdNV4,4932
|
|
@@ -10,29 +11,33 @@ gismap/gisgraphs/groups.py,sha256=1E-7Xrv0uDw2SgqwtdjgeRLVBLaC7agUrrVics4jVLs,24
|
|
|
10
11
|
gismap/gisgraphs/js.py,sha256=Gbz5nMWORabZkgIdyZAe1sMlnwJZ9jy7sLrx0vYStzI,6283
|
|
11
12
|
gismap/gisgraphs/options.py,sha256=lmUSnfSwrZQyJpGGs16JUGDIQNcJeX4Y0tA8cyC0nuM,817
|
|
12
13
|
gismap/gisgraphs/style.py,sha256=sXNUnv690kxiJiRQZ7lv4iKKrsxMqAfblheJbqesd48,4653
|
|
13
|
-
gismap/gisgraphs/widget.py,sha256=
|
|
14
|
+
gismap/gisgraphs/widget.py,sha256=ccTgmfs1-23aVFnOv09aKMf07pfsEsgeLdcywVELzL8,4537
|
|
14
15
|
gismap/lab/__init__.py,sha256=ifyZqI9BpC5NRlMfSmJ671tnKWJDoXbo18iDoE-VR1s,181
|
|
15
16
|
gismap/lab/egomap.py,sha256=RabRJSWJ0xrG67l012En0rbi7ukr4R2lR0hc_K7Xp0o,1211
|
|
16
17
|
gismap/lab/expansion.py,sha256=CMUsXqo-shRyb_MiuPRL5-ZgaitxAxjfbSY_fvzi_1E,6236
|
|
17
18
|
gismap/lab/filters.py,sha256=pG_g2POQXMbyUUw0aXOaeyiGBbiSc7M2NzxLCTQrALk,1875
|
|
18
|
-
gismap/lab/lab_author.py,sha256=
|
|
19
|
-
gismap/lab/labmap.py,sha256=
|
|
19
|
+
gismap/lab/lab_author.py,sha256=tiv6Z2RUrmfba0zYNS83cPTwN2YyGj7_bcqN2Ak_JXk,4420
|
|
20
|
+
gismap/lab/labmap.py,sha256=jDXFIxe0Jk89wUaweodPxN2thxMgi-hgnqSavhaapZc,5748
|
|
20
21
|
gismap/lab_examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
22
|
gismap/lab_examples/cedric.py,sha256=AjgYy5dhzqh3vDsr9ia_hbtSc9_2Ic238rmJO198FMM,1764
|
|
23
|
+
gismap/lab_examples/lamsade.py,sha256=m5uDT9IGpBT1ARknKl44WmFv5b_tLWfvtOjgOThp5fA,1294
|
|
22
24
|
gismap/lab_examples/lincs.py,sha256=-mIVMGQMrtCtJ3N-oCU8j4Ko9mDuhEPB_pA0gaIw4QA,1126
|
|
23
25
|
gismap/lab_examples/lip6.py,sha256=K32Jqe3-o99QYI--akmwBDFAWKgq0HFEk_psC4akR60,1740
|
|
24
26
|
gismap/lab_examples/toulouse.py,sha256=OUKrK0uefn4uvW74qMsF792un203z3OUfKTquLPGBH4,2091
|
|
25
27
|
gismap/sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
-
gismap/sources/dblp.py,sha256=
|
|
27
|
-
gismap/sources/
|
|
28
|
+
gismap/sources/dblp.py,sha256=FXVsRhrPc0iqsd_a9cMzUYB5YdMxOC4ho3Ip4lCyjtE,4834
|
|
29
|
+
gismap/sources/dblp_ttl.py,sha256=JI_1C7yv1T8TfXMfLNPSFBbCoghYMYoDY7s6K_2arUs,5456
|
|
30
|
+
gismap/sources/hal.py,sha256=VOd7mEUeM0wcfetHYYsX5n4jXNVYQKP12G-iNQsa0XE,10313
|
|
31
|
+
gismap/sources/ldb.py,sha256=KEHREkne7hUy-04VKJOlvzkJQhvKKZJADcvhEBLCgfY,16766
|
|
28
32
|
gismap/sources/models.py,sha256=XlNrQWTF-DQbfIFaSLPsgWPN-c79_0rfr_2jDasgukM,713
|
|
29
|
-
gismap/sources/multi.py,sha256=
|
|
33
|
+
gismap/sources/multi.py,sha256=QlVtuQasznXSXSmJryWFWb2ZmaOOJFoEpgn2Js-IGcc,4709
|
|
30
34
|
gismap/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
-
gismap/utils/common.py,sha256=
|
|
32
|
-
gismap/utils/logger.py,sha256=
|
|
33
|
-
gismap/utils/requests.py,sha256=
|
|
35
|
+
gismap/utils/common.py,sha256=6JhdB_EJnaXwnBGiJutPx5vFEr4wYEvsqKcivVDbGMk,3115
|
|
36
|
+
gismap/utils/logger.py,sha256=zvOPJqC7V6GV4Ov8M9-tnK63c2poDAEcWq_UarOLcpg,117
|
|
37
|
+
gismap/utils/requests.py,sha256=ZSKYJ08MlEtJTHdKYi61KxK6RjYxTBNxWjEUH-EtbbI,1468
|
|
34
38
|
gismap/utils/text.py,sha256=1_9DlduAYh7Nz-yAg-MaCTmdKbPPmuIY20bb87t7JAQ,3810
|
|
35
|
-
gismap
|
|
36
|
-
gismap-0.
|
|
37
|
-
gismap-0.
|
|
38
|
-
gismap-0.
|
|
39
|
+
gismap/utils/zlist.py,sha256=F66rilTalbRgqiJaPIxDJxKs_2KFOp2ZEH8Ef_CRxYA,1810
|
|
40
|
+
gismap-0.4.0.dist-info/METADATA,sha256=wrptTFqdKckSixC0KEzzQ8pVH5aes2vDfz-5NKFrS-A,3903
|
|
41
|
+
gismap-0.4.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
42
|
+
gismap-0.4.0.dist-info/licenses/AUTHORS.md,sha256=oDR4mptVUBMq0WKIpt19Km1Bdfz3cO2NAOVgwVfTO8g,131
|
|
43
|
+
gismap-0.4.0.dist-info/RECORD,,
|
|
File without changes
|