PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl - Mend

crfm-helm 0.5.4-py3-none-any.whl → 0.5.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (580) hide show

helm/benchmark/metrics/ifeval/instructions_registry.py ADDED Viewed

@@ -0,0 +1,182 @@
+# flake8: noqa
+# type: ignore
+# fmt: off
+# The following code was reproduced with minor modifications to `import` statements from the following URL:
+# https://github.com/google-research/google-research/blob/c7f60c013623e613732a096e2a0c2872491ec912/instruction_following_eval/instructions_registry.py
+# coding=utf-8
+# Copyright 2023 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Registry of all instructions."""
+import helm.benchmark.metrics.ifeval.instructions as instructions
+# Category prefixes. An instruction id is a category prefix followed by the
+# instruction name, e.g. "keywords:existence".
+_KEYWORD = "keywords:"
+_LANGUAGE = "language:"
+_LENGTH = "length_constraints:"
+_CONTENT = "detectable_content:"
+_FORMAT = "detectable_format:"
+# _MULTITURN is only referenced by commented-out entries in this module.
+_MULTITURN = "multi-turn:"
+_COMBINATION = "combination:"
+_STARTEND = "startend:"
+_CHANGE_CASES = "change_case:"
+_PUNCTUATION = "punctuation:"
+# Registry mapping each instruction id to the checker from `instructions`
+# that is looked up by that id. Commented-out entries are disabled
+# instructions kept together with their TODO notes.
+INSTRUCTION_DICT = {
+    _KEYWORD + "existence": instructions.KeywordChecker,
+    _KEYWORD + "frequency": instructions.KeywordFrequencyChecker,
+    # TODO(jeffreyzhou): make a proper set of sentences to choose from
+    # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
+    _KEYWORD + "forbidden_words": instructions.ForbiddenWords,
+    _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker,
+    _LANGUAGE + "response_language": instructions.ResponseLanguageChecker,
+    _LENGTH + "number_sentences": instructions.NumberOfSentences,
+    _LENGTH + "number_paragraphs": instructions.ParagraphChecker,
+    _LENGTH + "number_words": instructions.NumberOfWords,
+    _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck,
+    _CONTENT + "number_placeholders": instructions.PlaceholderChecker,
+    _CONTENT + "postscript": instructions.PostscriptChecker,
+    _FORMAT + "number_bullet_lists": instructions.BulletListChecker,
+    # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
+    # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
+    _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker,
+    _FORMAT + "number_highlighted_sections": (
+        instructions.HighlightSectionChecker),
+    _FORMAT + "multiple_sections": instructions.SectionChecker,
+    # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
+    # _FORMAT + "rephrase": instructions.RephraseChecker,
+    _FORMAT + "json_format": instructions.JsonFormat,
+    _FORMAT + "title": instructions.TitleChecker,
+    # TODO(tianjianlu): Re-enable with specific prompts.
+    # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
+    _COMBINATION + "two_responses": instructions.TwoResponsesChecker,
+    _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer,
+    _STARTEND + "end_checker": instructions.EndChecker,
+    _CHANGE_CASES
+    + "capital_word_frequency": instructions.CapitalWordFrequencyChecker,
+    _CHANGE_CASES
+    + "english_capital": instructions.CapitalLettersEnglishChecker,
+    _CHANGE_CASES
+    + "english_lowercase": instructions.LowercaseLettersEnglishChecker,
+    _PUNCTUATION + "no_comma": instructions.CommaChecker,
+    _STARTEND + "quotation": instructions.QuotationChecker,
+}
+# For each instruction id, the set of instruction ids it conflicts with.
+# The table as written is not symmetric (e.g. "number_paragraphs" lists
+# "number_sentences" but not the other way round). Entries built from
+# set(INSTRUCTION_DICT.keys()) conflict with every registered instruction,
+# minus the ids passed to .difference().
+INSTRUCTION_CONFLICTS = {
+    _KEYWORD + "existence": {_KEYWORD + "existence"},
+    _KEYWORD + "frequency": {_KEYWORD + "frequency"},
+    # TODO(jeffreyzhou): make a proper set of sentences to choose from
+    # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
+    _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"},
+    _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"},
+    _LANGUAGE
+    + "response_language": {
+        _LANGUAGE + "response_language",
+        _FORMAT + "multiple_sections",
+        _KEYWORD + "existence",
+        _KEYWORD + "frequency",
+        _KEYWORD + "forbidden_words",
+        _STARTEND + "end_checker",
+        _CHANGE_CASES + "english_capital",
+        _CHANGE_CASES + "english_lowercase",
+    },
+    _LENGTH + "number_sentences": {_LENGTH + "number_sentences"},
+    # "nth_paragraph_first_word" used to be listed twice in this set literal;
+    # the duplicate had no effect on the resulting set and was dropped.
+    _LENGTH + "number_paragraphs": {
+        _LENGTH + "number_paragraphs",
+        _LENGTH + "nth_paragraph_first_word",
+        _LENGTH + "number_sentences",
+    },
+    _LENGTH + "number_words": {_LENGTH + "number_words"},
+    _LENGTH + "nth_paragraph_first_word": {
+        _LENGTH + "nth_paragraph_first_word",
+        _LENGTH + "number_paragraphs",
+    },
+    _CONTENT + "number_placeholders": {_CONTENT + "number_placeholders"},
+    _CONTENT + "postscript": {_CONTENT + "postscript"},
+    _FORMAT + "number_bullet_lists": {_FORMAT + "number_bullet_lists"},
+    # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
+    # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
+    _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()),
+    _FORMAT
+    + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"},
+    _FORMAT
+    + "multiple_sections": {
+        _FORMAT + "multiple_sections",
+        _LANGUAGE + "response_language",
+        _FORMAT + "number_highlighted_sections",
+    },
+    # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
+    # _FORMAT + "rephrase": instructions.RephraseChecker,
+    _FORMAT
+    + "json_format": set(INSTRUCTION_DICT.keys()).difference(
+        {_KEYWORD + "forbidden_words", _KEYWORD + "existence"}
+    ),
+    _FORMAT + "title": {_FORMAT + "title"},
+    # TODO(tianjianlu): Re-enable with specific prompts.
+    # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
+    _COMBINATION
+    + "two_responses": set(INSTRUCTION_DICT.keys()).difference({
+        _KEYWORD + "forbidden_words",
+        _KEYWORD + "existence",
+        _LANGUAGE + "response_language",
+        _FORMAT + "title",
+        _PUNCTUATION + "no_comma"
+    }),
+    _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference({
+        _KEYWORD + "existence",
+        _FORMAT + "title",
+        _PUNCTUATION + "no_comma"
+    }),
+    _STARTEND + "end_checker": {_STARTEND + "end_checker"},
+    _CHANGE_CASES + "capital_word_frequency": {
+        _CHANGE_CASES + "capital_word_frequency",
+        _CHANGE_CASES + "english_lowercase",
+        _CHANGE_CASES + "english_capital",
+    },
+    _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"},
+    _CHANGE_CASES + "english_lowercase": {
+        _CHANGE_CASES + "english_lowercase",
+        _CHANGE_CASES + "english_capital",
+    },
+    _PUNCTUATION + "no_comma": {_PUNCTUATION + "no_comma"},
+    _STARTEND + "quotation": {_STARTEND + "quotation", _FORMAT + "title"},
+}
+def conflict_make(conflicts):
+  """Makes the conflict relation symmetric and reflexive, in place.
+  Args:
+    conflicts: Dictionary of potential conflicts where key is instruction id
+      and value is set of instruction ids that it conflicts with. Both the
+      dictionary and its sets are mutated.
+  Returns:
+    The same dictionary object. All instructions conflict with themselves.
+    If A conflicts with B, B will conflict with A. An id that only appears
+    inside another id's conflict set gets an entry of its own instead of
+    raising KeyError.
+  """
+  # Iterate over snapshots: new keys may be added to `conflicts` and the sets
+  # may grow while the relation is being completed.
+  for key in list(conflicts):
+    for other in list(conflicts[key]):
+      # A missing id starts out conflicting with itself, then with `key`.
+      conflicts.setdefault(other, {other}).add(key)
+    conflicts[key].add(key)
+  return conflicts

helm/benchmark/metrics/ifeval/instructions_registry.pyi ADDED Viewed

@@ -0,0 +1,3 @@
+from typing import Dict, Any
+# Stub declaration: INSTRUCTION_DICT is exposed to type checkers as a mapping
+# from str keys to values of any type.
+INSTRUCTION_DICT: Dict[str, Any]

helm/benchmark/metrics/ifeval/instructions_util.py ADDED Viewed

@@ -0,0 +1,153 @@
+# flake8: noqa
+# type: ignore
+# fmt: off
+# The following code was reproduced from the following URL:
+# https://github.com/google-research/google-research/blob/c7f60c013623e613732a096e2a0c2872491ec912/instruction_following_eval/instructions_util.py
+# coding=utf-8
+# Copyright 2023 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utility library of instructions."""
+import functools
+import random
+import re
+from typing import List
+import immutabledict
+import nltk
+WORD_LIST = ["western", "sentence", "signal", "dump", "spot", "opposite", "bottom", "potato", "administration", "working", "welcome", "morning", "good", "agency", "primary", "wish", "responsibility", "press", "problem", "president", "steal", "brush", "read", "type", "beat", "trainer", "growth", "lock", "bone", "case", "equal", "comfortable", "region", "replacement", "performance", "mate", "walk", "medicine", "film", "thing", "rock", "tap", "total", "competition", "ease", "south", "establishment", "gather", "parking", "world", "plenty", "breath", "claim", "alcohol", "trade", "dear", "highlight", "street", "matter", "decision", "mess", "agreement", "studio", "coach", "assist", "brain", "wing", "style", "private", "top", "brown", "leg", "buy", "procedure", "method", "speed", "high", "company", "valuable", "pie", "analyst", "session", "pattern", "district", "pleasure", "dinner", "swimming", "joke", "order", "plate", "department", "motor", "cell", "spend", "cabinet", "difference", "power", "examination", "engine", "horse", "dimension", "pay", "toe", "curve", "literature", "bother", "fire", "possibility", "debate", "activity", "passage", "hello", "cycle", "background", "quiet", "author", "effect", "actor", "page", "bicycle", "error", "throat", "attack", "character", "phone", "tea", "increase", "outcome", "file", "specific", "inspector", "internal", "potential", "staff", "building", "employer", "shoe", "hand", "direction", "garden", "purchase", "interview", "study", "recognition", "member", "spiritual", "oven", "sandwich", "weird", "passenger", "particular", "response", "reaction", "size", "variation", "a", "cancel", "candy", "exit", "guest", "condition", "fly", "price", "weakness", "convert", "hotel", "great", "mouth", "mind", "song", "sugar", "suspect", "telephone", "ear", "roof", "paint", "refrigerator", "organization", "jury", "reward", "engineering", "day", "possession", "crew", "bar", "road", "description", "celebration", "score", "mark", "letter", "shower", 
"suggestion", "sir", "luck", "national", "progress", "hall", "stroke", "theory", "offer", "story", "tax", "definition", "history", "ride", "medium", "opening", "glass", "elevator", "stomach", "question", "ability", "leading", "village", "computer", "city", "grand", "confidence", "candle", "priest", "recommendation", "point", "necessary", "body", "desk", "secret", "horror", "noise", "culture", "warning", "water", "round", "diet", "flower", "bus", "tough", "permission", "week", "prompt", "connection", "abuse", "height", "save", "corner", "border", "stress", "drive", "stop", "rip", "meal", "listen", "confusion", "girlfriend", "living", "relation", "significance", "plan", "creative", "atmosphere", "blame", "invite", "housing", "paper", "drink", "roll", "silver", "drunk", "age", "damage", "smoke", "environment", "pack", "savings", "influence", "tourist", "rain", "post", "sign", "grandmother", "run", "profit", "push", "clerk", "final", "wine", "swim", "pause", "stuff", "singer", "funeral", "average", "source", "scene", "tradition", "personal", "snow", "nobody", "distance", "sort", "sensitive", "animal", "major", "negotiation", "click", "mood", "period", "arrival", "expression", "holiday", "repeat", "dust", "closet", "gold", "bad", "sail", "combination", "clothes", "emphasis", "duty", "black", "step", "school", "jump", "document", "professional", "lip", "chemical", "front", "wake", "while", "inside", "watch", "row", "subject", "penalty", "balance", "possible", "adult", "aside", "sample", "appeal", "wedding", "depth", "king", "award", "wife", "blow", "site", "camp", "music", "safe", "gift", "fault", "guess", "act", "shame", "drama", "capital", "exam", "stupid", "record", "sound", "swing", "novel", "minimum", "ratio", "machine", "shape", "lead", "operation", "salary", "cloud", "affair", "hit", "chapter", "stage", "quantity", "access", "army", "chain", "traffic", "kick", "analysis", "airport", "time", "vacation", "philosophy", "ball", "chest", "thanks", "place", "mountain", 
"advertising", "red", "past", "rent", "return", "tour", "house", "construction", "net", "native", "war", "figure", "fee", "spray", "user", "dirt", "shot", "task", "stick", "friend", "software", "promotion", "interaction", "surround", "block", "purpose", "practice", "conflict", "routine", "requirement", "bonus", "hole", "state", "junior", "sweet", "catch", "tear", "fold", "wall", "editor", "life", "position", "pound", "respect", "bathroom", "coat", "script", "job", "teach", "birth", "view", "resolve", "theme", "employee", "doubt", "market", "education", "serve", "recover", "tone", "harm", "miss", "union", "understanding", "cow", "river", "association", "concept", "training", "recipe", "relationship", "reserve", "depression", "proof", "hair", "revenue", "independent", "lift", "assignment", "temporary", "amount", "loss", "edge", "track", "check", "rope", "estimate", "pollution", "stable", "message", "delivery", "perspective", "mirror", "assistant", "representative", "witness", "nature", "judge", "fruit", "tip", "devil", "town", "emergency", "upper", "drop", "stay", "human", "neck", "speaker", "network", "sing", "resist", "league", "trip", "signature", "lawyer", "importance", "gas", "choice", "engineer", "success", "part", "external", "worker", "simple", "quarter", "student", "heart", "pass", "spite", "shift", "rough", "lady", "grass", "community", "garage", "youth", "standard", "skirt", "promise", "blind", "television", "disease", "commission", "positive", "energy", "calm", "presence", "tune", "basis", "preference", "head", "common", "cut", "somewhere", "presentation", "current", "thought", "revolution", "effort", "master", "implement", "republic", "floor", "principle", "stranger", "shoulder", "grade", "button", "tennis", "police", "collection", "account", "register", "glove", "divide", "professor", "chair", "priority", "combine", "peace", "extension", "maybe", "evening", "frame", "sister", "wave", "code", "application", "mouse", "match", "counter", "bottle", "half", 
"cheek", "resolution", "back", "knowledge", "make", "discussion", "screw", "length", "accident", "battle", "dress", "knee", "log", "package", "it", "turn", "hearing", "newspaper", "layer", "wealth", "profile", "imagination", "answer", "weekend", "teacher", "appearance", "meet", "bike", "rise", "belt", "crash", "bowl", "equivalent", "support", "image", "poem", "risk", "excitement", "remote", "secretary", "public", "produce", "plane", "display", "money", "sand", "situation", "punch", "customer", "title", "shake", "mortgage", "option", "number", "pop", "window", "extent", "nothing", "experience", "opinion", "departure", "dance", "indication", "boy", "material", "band", "leader", "sun", "beautiful", "muscle", "farmer", "variety", "fat", "handle", "director", "opportunity", "calendar", "outside", "pace", "bath", "fish", "consequence", "put", "owner", "go", "doctor", "information", "share", "hurt", "protection", "career", "finance", "force", "golf", "garbage", "aspect", "kid", "food", "boot", "milk", "respond", "objective", "reality", "raw", "ring", "mall", "one", "impact", "area", "news", "international", "series", "impress", "mother", "shelter", "strike", "loan", "month", "seat", "anything", "entertainment", "familiar", "clue", "year", "glad", "supermarket", "natural", "god", "cost", "conversation", "tie", "ruin", "comfort", "earth", "storm", "percentage", "assistance", "budget", "strength", "beginning", "sleep", "other", "young", "unit", "fill", "store", "desire", "hide", "value", "cup", "maintenance", "nurse", "function", "tower", "role", "class", "camera", "database", "panic", "nation", "basket", "ice", "art", "spirit", "chart", "exchange", "feedback", "statement", "reputation", "search", "hunt", "exercise", "nasty", "notice", "male", "yard", "annual", "collar", "date", "platform", "plant", "fortune", "passion", "friendship", "spread", "cancer", "ticket", "attitude", "island", "active", "object", "service", "buyer", "bite", "card", "face", "steak", "proposal", 
"patient", "heat", "rule", "resident", "broad", "politics", "west", "knife", "expert", "girl", "design", "salt", "baseball", "grab", "inspection", "cousin", "couple", "magazine", "cook", "dependent", "security", "chicken", "version", "currency", "ladder", "scheme", "kitchen", "employment", "local", "attention", "manager", "fact", "cover", "sad", "guard", "relative", "county", "rate", "lunch", "program", "initiative", "gear", "bridge", "breast", "talk", "dish", "guarantee", "beer", "vehicle", "reception", "woman", "substance", "copy", "lecture", "advantage", "park", "cold", "death", "mix", "hold", "scale", "tomorrow", "blood", "request", "green", "cookie", "church", "strip", "forever", "beyond", "debt", "tackle", "wash", "following", "feel", "maximum", "sector", "sea", "property", "economics", "menu", "bench", "try", "language", "start", "call", "solid", "address", "income", "foot", "senior", "honey", "few", "mixture", "cash", "grocery", "link", "map", "form", "factor", "pot", "model", "writer", "farm", "winter", "skill", "anywhere", "birthday", "policy", "release", "husband", "lab", "hurry", "mail", "equipment", "sink", "pair", "driver", "consideration", "leather", "skin", "blue", "boat", "sale", "brick", "two", "feed", "square", "dot", "rush", "dream", "location", "afternoon", "manufacturer", "control", "occasion", "trouble", "introduction", "advice", "bet", "eat", "kill", "category", "manner", "office", "estate", "pride", "awareness", "slip", "crack", "client", "nail", "shoot", "membership", "soft", "anybody", "web", "official", "individual", "pizza", "interest", "bag", "spell", "profession", "queen", "deal", "resource", "ship", "guy", "chocolate", "joint", "formal", "upstairs", "car", "resort", "abroad", "dealer", "associate", "finger", "surgery", "comment", "team", "detail", "crazy", "path", "tale", "initial", "arm", "radio", "demand", "single", "draw", "yellow", "contest", "piece", "quote", "pull", "commercial", "shirt", "contribution", "cream", "channel", 
"suit", "discipline", "instruction", "concert", "speech", "low", "effective", "hang", "scratch", "industry", "breakfast", "lay", "join", "metal", "bedroom", "minute", "product", "rest", "temperature", "many", "give", "argument", "print", "purple", "laugh", "health", "credit", "investment", "sell", "setting", "lesson", "egg", "middle", "marriage", "level", "evidence", "phrase", "love", "self", "benefit", "guidance", "affect", "you", "dad", "anxiety", "special", "boyfriend", "test", "blank", "payment", "soup", "obligation", "reply", "smile", "deep", "complaint", "addition", "review", "box", "towel", "minor", "fun", "soil", "issue", "cigarette", "internet", "gain", "tell", "entry", "spare", "incident", "family", "refuse", "branch", "can", "pen", "grandfather", "constant", "tank", "uncle", "climate", "ground", "volume", "communication", "kind", "poet", "child", "screen", "mine", "quit", "gene", "lack", "charity", "memory", "tooth", "fear", "mention", "marketing", "reveal", "reason", "court", "season", "freedom", "land", "sport", "audience", "classroom", "law", "hook", "win", "carry", "eye", "smell", "distribution", "research", "country", "dare", "hope", "whereas", "stretch", "library", "if", "delay", "college", "plastic", "book", "present", "use", "worry", "champion", "goal", "economy", "march", "election", "reflection", "midnight", "slide", "inflation", "action", "challenge", "guitar", "coast", "apple", "campaign", "field", "jacket", "sense", "way", "visual", "remove", "weather", "trash", "cable", "regret", "buddy", "beach", "historian", "courage", "sympathy", "truck", "tension", "permit", "nose", "bed", "son", "person", "base", "meat", "usual", "air", "meeting", "worth", "game", "independence", "physical", "brief", "play", "raise", "board", "she", "key", "writing", "pick", "command", "party", "yesterday", "spring", "candidate", "physics", "university", "concern", "development", "change", "string", "target", "instance", "room", "bitter", "bird", "football", "normal", 
"split", "impression", "wood", "long", "meaning", "stock", "cap", "leadership", "media", "ambition", "fishing", "essay", "salad", "repair", "today", "designer", "night", "bank", "drawing", "inevitable", "phase", "vast", "chip", "anger", "switch", "cry", "twist", "personality", "attempt", "storage", "being", "preparation", "bat", "selection", "white", "technology", "contract", "side", "section", "station", "till", "structure", "tongue", "taste", "truth", "difficulty", "group", "limit", "main", "move", "feeling", "light", "example", "mission", "might", "wait", "wheel", "shop", "host", "classic", "alternative", "cause", "agent", "consist", "table", "airline", "text", "pool", "craft", "range", "fuel", "tool", "partner", "load", "entrance", "deposit", "hate", "article", "video", "summer", "feature", "extreme", "mobile", "hospital", "flight", "fall", "pension", "piano", "fail", "result", "rub", "gap", "system", "report", "suck", "ordinary", "wind", "nerve", "ask", "shine", "note", "line", "mom", "perception", "brother", "reference", "bend", "charge", "treat", "trick", "term", "homework", "bake", "bid", "status", "project", "strategy", "orange", "let", "enthusiasm", "parent", "concentrate", "device", "travel", "poetry", "business", "society", "kiss", "end", "vegetable", "employ", "schedule", "hour", "brave", "focus", "process", "movie", "illegal", "general", "coffee", "ad", "highway", "chemistry", "psychology", "hire", "bell", "conference", "relief", "show", "neat", "funny", "weight", "quality", "club", "daughter", "zone", "touch", "tonight", "shock", "burn", "excuse", "name", "survey", "landscape", "advance", "satisfaction", "bread", "disaster", "item", "hat", "prior", "shopping", "visit", "east", "photo", "home", "idea", "father", "comparison", "cat", "pipe", "winner", "count", "lake", "fight", "prize", "foundation", "dog", "keep", "ideal", "fan", "struggle", "peak", "safety", "solution", "hell", "conclusion", "population", "strain", "alarm", "measurement", "second", 
"train", "race", "due", "insurance", "boss", "tree", "monitor", "sick", "course", "drag", "appointment", "slice", "still", "care", "patience", "rich", "escape", "emotion", "royal", "female", "childhood", "government", "picture", "will", "sock", "big", "gate", "oil", "cross", "pin", "improvement", "championship", "silly", "help", "sky", "pitch", "man", "diamond", "most", "transition", "work", "science", "committee", "moment", "fix", "teaching", "dig", "specialist", "complex", "guide", "people", "dead", "voice", "original", "break", "topic", "data", "degree", "reading", "recording", "bunch", "reach", "judgment", "lie", "regular", "set", "painting", "mode", "list", "player", "bear", "north", "wonder", "carpet", "heavy", "officer", "negative", "clock", "unique", "baby", "pain", "assumption", "disk", "iron", "bill", "drawer", "look", "double", "mistake", "finish", "future", "brilliant", "contact", "math", "rice", "leave", "restaurant", "discount", "sex", "virus", "bit", "trust", "event", "wear", "juice", "failure", "bug", "context", "mud", "whole", "wrap", "intention", "draft", "pressure", "cake", "dark", "explanation", "space", "angle", "word", "efficiency", "management", "habit", "star", "chance", "finding", "transportation", "stand", "criticism", "flow", "door", "injury", "insect", "surprise", "apartment"]  # pylint: disable=line-too-long
+# ISO 639-1 codes to language names.
+# Wrapped in immutabledict so the mapping cannot be modified after import.
+LANGUAGE_CODES = immutabledict.immutabledict({
+    "en": "English",
+    "es": "Spanish",
+    "pt": "Portuguese",
+    "ar": "Arabic",
+    "hi": "Hindi",
+    "fr": "French",
+    "ru": "Russian",
+    "de": "German",
+    "ja": "Japanese",
+    "it": "Italian",
+    "bn": "Bengali",
+    "uk": "Ukrainian",
+    "th": "Thai",
+    "ur": "Urdu",
+    "ta": "Tamil",
+    "te": "Telugu",
+    "bg": "Bulgarian",
+    "ko": "Korean",
+    "pl": "Polish",
+    "he": "Hebrew",
+    "fa": "Persian",
+    "vi": "Vietnamese",
+    "ne": "Nepali",
+    "sw": "Swahili",
+    "kn": "Kannada",
+    "mr": "Marathi",
+    "gu": "Gujarati",
+    "pa": "Punjabi",
+    "ml": "Malayalam",
+    "fi": "Finnish",
+    })
+# Regex fragments used by split_into_sentences. Each fragment is combined
+# with others there, so most are written as capturing groups.
+# A single ASCII letter.
+_ALPHABETS = "([A-Za-z])"
+# A title abbreviation followed by its period.
+_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
+# Abbreviations matched after a space and before a period.
+_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
+# Words that, when they follow an acronym or suffix, are treated as the
+# start of a new sentence.
+_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+# Two or three capital letters, each followed by a period (e.g. "U.S.").
+_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
+# A period followed by one of the listed domain suffixes.
+_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
+# A single decimal digit.
+_DIGITS = "([0-9])"
+# A run of two or more consecutive periods.
+_MULTIPLE_DOTS = r"\.{2,}"
+def split_into_sentences(text):
+  """Split the text into sentences.
+  Periods that should not end a sentence (after titles, inside websites,
+  decimals, initials, acronyms, ...) are first rewritten to the placeholder
+  "<prd>". Sentence boundaries are then marked with "<stop>", the
+  placeholders are turned back into periods, and the text is split on
+  "<stop>". The order of the substitutions matters.
+  Args:
+    text: A string that consists of one or more sentences.
+  Returns:
+    A list of strings where each string is a sentence with surrounding
+    whitespace stripped.
+  """
+  # Pad with spaces and flatten newlines; several patterns below are anchored
+  # on a neighbouring space.
+  text = " " + text + "  "
+  text = text.replace("\n", " ")
+  # Protect periods after titles, before website suffixes and between digits.
+  text = re.sub(_PREFIXES, "\\1<prd>", text)
+  text = re.sub(_WEBSITES, "<prd>\\1", text)
+  text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1<prd>\\2", text)
+  # A run of dots becomes one "<prd>" per dot, followed by a boundary.
+  text = re.sub(
+      _MULTIPLE_DOTS,
+      lambda match: "<prd>" * len(match.group(0)) + "<stop>",
+      text,
+  )
+  if "Ph.D" in text:
+    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
+  # Single-letter initial: whitespace, one letter, period, space.
+  text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1<prd> ", text)
+  # An acronym followed by a sentence starter ends the sentence.
+  text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1<stop> \\2", text)
+  # Protect the periods of three-letter, then two-letter, dotted
+  # abbreviations.
+  text = re.sub(
+      _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
+      "\\1<prd>\\2<prd>\\3<prd>",
+      text,
+  )
+  text = re.sub(
+      _ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text
+  )
+  # A suffix followed by a sentence starter ends the sentence; any other
+  # suffix period is protected.
+  text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1<stop> \\2", text)
+  text = re.sub(" " + _SUFFIXES + "[.]", " \\1<prd>", text)
+  # Remaining single letters after a space keep their period.
+  text = re.sub(" " + _ALPHABETS + "[.]", " \\1<prd>", text)
+  # Swap a terminator with the closing quote that follows it, so the
+  # "<stop>" inserted below lands after the quote.
+  if "”" in text:
+    text = text.replace(".”", "”.")
+  if '"' in text:
+    text = text.replace('."', '".')
+  if "!" in text:
+    text = text.replace('!"', '"!')
+  if "?" in text:
+    text = text.replace('?"', '"?')
+  # Every remaining ".", "?" and "!" ends a sentence.
+  text = text.replace(".", ".<stop>")
+  text = text.replace("?", "?<stop>")
+  text = text.replace("!", "!<stop>")
+  # Restore the protected periods, then split on the boundary markers.
+  text = text.replace("<prd>", ".")
+  sentences = text.split("<stop>")
+  sentences = [s.strip() for s in sentences]
+  # Drop the empty piece left after the final boundary marker.
+  if sentences and not sentences[-1]:
+    sentences = sentences[:-1]
+  return sentences
+def count_words(text):
+  """Counts the number of words.
+  Args:
+    text: The string to count words in.
+  Returns:
+    The number of maximal runs of regex word characters (letters, digits
+    and underscore) in `text`; 0 for an empty or punctuation-only string.
+  """
+  # Same matches as nltk.tokenize.RegexpTokenizer(r"\w+").tokenize(text),
+  # without constructing a tokenizer object on every call.
+  return len(re.findall(r"\w+", text))
+@functools.lru_cache(maxsize=None)
+def _get_sentence_tokenizer():
+  """Loads the English punkt sentence tokenizer through nltk.
+  The result is memoized by lru_cache, so the resource is loaded once.
+  """
+  # NOTE(review): this loads a pickled punkt model by resource path; confirm
+  # that the pinned nltk version still ships this resource.
+  punkt_tokenizer = nltk.data.load("nltk:tokenizers/punkt/english.pickle")
+  return punkt_tokenizer
+def count_sentences(text):
+  """Count the number of sentences.
+  Args:
+    text: The string to split with the cached sentence tokenizer.
+  Returns:
+    The number of sentences the tokenizer finds in `text`.
+  """
+  return len(_get_sentence_tokenizer().tokenize(text))
+def generate_keywords(num_keywords):
+  """Randomly generates a few keywords.
+  Args:
+    num_keywords: How many entries to draw from WORD_LIST.
+  Returns:
+    A list of `num_keywords` entries sampled from WORD_LIST without
+    replacement.
+  """
+  sampled_words = random.sample(WORD_LIST, k=num_keywords)
+  return sampled_words

helm/benchmark/metrics/ifeval_metrics.py ADDED Viewed

@@ -0,0 +1,55 @@
+from typing import List
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.ifeval.instructions_registry import INSTRUCTION_DICT
+class IFEvalMetric(Metric):
+    """Scores a single completion against the IFEval instructions stored on its instance."""
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """Return the fraction of the instance's instructions that the response follows.
+        Reads `instruction_ids` and `instruction_kwargs` from the instance's `extra_data`, checks the
+        stripped text of the single completion with each instruction's checker, and reports the mean
+        of the per-instruction 0/1 outcomes as `ifeval_strict_accuracy`. A checker that raises is
+        logged and counted as not followed; an empty response fails every instruction.
+        """
+        instance = request_state.instance
+        prompt = instance.input.text
+        assert instance.extra_data
+        instruction_ids = instance.extra_data["instruction_ids"]
+        instruction_kwargs = instance.extra_data["instruction_kwargs"]
+        assert len(instruction_ids) > 0
+        result = request_state.result
+        assert result
+        assert len(result.completions) == 1, f"Got {len(result.completions)} completions"
+        response = result.completions[0].text.strip()
+        # The following logic was reproduced with minor modifications from the following URL:
+        # https://github.com/google-research/google-research/blob/c7f60c013623e613732a096e2a0c2872491ec912/
+        # instruction_following_eval/evaluation_main.py#L96-L125
+        followed_flags: List[int] = []
+        for index, instruction_id in enumerate(instruction_ids):
+            checker = INSTRUCTION_DICT[instruction_id](instruction_id)
+            # Only pass the keyword arguments that were actually provided.
+            provided_kwargs = {name: value for name, value in instruction_kwargs[index].items() if value is not None}
+            checker.build_description(**provided_kwargs)
+            checker_args = checker.get_instruction_args()
+            if checker_args and "prompt" in checker_args:
+                checker.build_description(prompt=prompt)
+            followed = False
+            # `response` was stripped above, so an all-whitespace completion is empty here.
+            if response:
+                try:
+                    followed = checker.check_following(response)
+                except Exception as e:
+                    hlog(f"WARNING: Instruction following checking failed with error message {e}")
+            followed_flags.append(1 if followed else 0)
+        strict_accuracy = sum(followed_flags) / len(followed_flags)
+        return [Stat(MetricName("ifeval_strict_accuracy")).add(strict_accuracy)]

helm/benchmark/metrics/image_generation/aesthetics_metrics.py CHANGED Viewed

@@ -9,7 +9,7 @@ from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.metrics.metric import Metric
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
-from .aesthetics_scorer import AestheticsScorer
+from helm.benchmark.metrics.image_generation.aesthetics_scorer import AestheticsScorer
 from helm.common.multimodal_request_utils import gather_generated_image_locations

helm/benchmark/metrics/image_generation/detection_metrics.py CHANGED Viewed

@@ -10,7 +10,7 @@ from helm.benchmark.metrics.metric import Metric
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.multimodal_request_utils import gather_generated_image_locations
-from .detectors.vitdet import ViTDetDetector
+from helm.benchmark.metrics.image_generation.detectors.vitdet import ViTDetDetector
 class DetectionMetric(Metric):

helm/benchmark/metrics/image_generation/detectors/vitdet.py CHANGED Viewed

@@ -8,7 +8,7 @@ from helm.common.general import ensure_file_downloaded, hlog
 from helm.common.images_utils import open_image
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.gpu_utils import get_torch_device
-from .base_detector import BaseDetector
+from helm.benchmark.metrics.image_generation.detectors.base_detector import BaseDetector
 MODEL_CONFIG_DOWNLOAD_URL: str = "https://drive.google.com/uc?id=1MLuwQ0ZN0gJQ42oVCc0aFz6Rneb1g3Rt"

helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
-from .fractal_dimension_util import compute_fractal_dimension
+from helm.benchmark.metrics.image_generation.fractal_dimension.fractal_dimension_util import compute_fractal_dimension
 def fractal_dimension_test(image_filename: str, expected_fractal_dimension: float):

helm/benchmark/metrics/image_generation/fractal_dimension_metric.py CHANGED Viewed

@@ -10,7 +10,7 @@ from helm.benchmark.metrics.metric import Metric
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.multimodal_request_utils import gather_generated_image_locations
-from .fractal_dimension.fractal_dimension_util import compute_fractal_dimension
+from helm.benchmark.metrics.image_generation.fractal_dimension.fractal_dimension_util import compute_fractal_dimension
 class FractalDimensionMetric(Metric):

helm/benchmark/metrics/image_generation/nsfw_metrics.py CHANGED Viewed

@@ -10,7 +10,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.toxicity_utils import is_prompt_toxic
 from helm.clients.image_generation.dalle2_client import DALLE2Client
 from helm.common.images_utils import is_blacked_out_image
-from .nsfw_detector import NSFWDetector
+from helm.benchmark.metrics.image_generation.nsfw_detector import NSFWDetector
 class NSFWMetric(Metric):

helm/benchmark/metrics/image_generation/q16/test_q16.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import os
+import pytest
-from .q16_toxicity_detector import Q16ToxicityDetector
+from helm.benchmark.metrics.image_generation.q16.q16_toxicity_detector import Q16ToxicityDetector
+@pytest.mark.skip(reason="Skipping due to flakiness.")
 class TestQ16:
     def setup_method(self, method):
         self._q16_detector = Q16ToxicityDetector()

helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py CHANGED Viewed

@@ -9,7 +9,7 @@ from helm.benchmark.metrics.metric import Metric
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.toxicity_utils import is_prompt_toxic
-from .q16.q16_toxicity_detector import Q16ToxicityDetector
+from helm.benchmark.metrics.image_generation.q16.q16_toxicity_detector import Q16ToxicityDetector
 from helm.common.multimodal_request_utils import gather_generated_image_locations

helm/benchmark/metrics/image_generation/skin_tone_metrics.py CHANGED Viewed

@@ -92,9 +92,9 @@ class SkinToneMetric(Metric):
                     and (Cr <= ((-2.2857 * Cb) + 432.85))
                 ):
-                    blue.append(img_rgba[i, j].item(0))
+                    blue.append(img_rgba[i, j].item(2))
                     green.append(img_rgba[i, j].item(1))
-                    red.append(img_rgba[i, j].item(2))
+                    red.append(img_rgba[i, j].item(0))
                 else:
                     img_rgba[i, j] = [0, 0, 0, 0]

helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from typing import List
 import os
-from .watermark_detector import WatermarkDetector
+from helm.benchmark.metrics.image_generation.watermark.watermark_detector import WatermarkDetector
 def test_compute_watermark_probability():

helm/benchmark/metrics/image_generation/watermark_metrics.py CHANGED Viewed

@@ -9,7 +9,7 @@ from helm.benchmark.metrics.metric import Metric
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.common.multimodal_request_utils import gather_generated_image_locations
-from .watermark.watermark_detector import WatermarkDetector
+from helm.benchmark.metrics.image_generation.watermark.watermark_detector import WatermarkDetector
 class WatermarkMetric(Metric):

helm/benchmark/metrics/instruction_following_critique_metrics.py CHANGED Viewed

@@ -3,10 +3,10 @@ from typing import Dict, List
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from .metric import Metric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
 from helm.common.critique_request import CritiqueTaskTemplate, CritiqueQuestionTemplate, CritiqueRequest, QuestionType

helm/benchmark/metrics/language_modeling_metrics.py CHANGED Viewed

@@ -11,10 +11,10 @@ from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from .metric import MetricInterface, MetricResult, PerInstanceStats, add_context
-from .metric_name import MetricContext, MetricName
-from .metric_service import MetricService
-from .statistic import Stat, merge_stat
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context
+from helm.benchmark.metrics.metric_name import MetricContext, MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat
 class LanguageModelingMetric(MetricInterface):

helm/benchmark/metrics/machine_translation_metrics.py CHANGED Viewed

@@ -3,8 +3,8 @@ from typing import List
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 from helm.common.optional_dependencies import handle_module_not_found_error
-from .metric_name import MetricName
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.statistic import Stat
 try:
     from sacrebleu.metrics import BLEU

crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl