RubyGems - commonmeta-ruby - Versions diffs - 3.9.0 → 3.12.0 - Mend

commonmeta-ruby 3.9.0 → 3.12.0

Files changed (32) hide show

checksums.yaml +4 -4
data/Gemfile.lock +12 -11
data/lib/commonmeta/author_utils.rb +12 -5
data/lib/commonmeta/readers/commonmeta_reader.rb +1 -1
data/lib/commonmeta/readers/datacite_reader.rb +120 -108
data/lib/commonmeta/schema_utils.rb +1 -1
data/lib/commonmeta/utils.rb +47 -2
data/lib/commonmeta/version.rb +1 -1
data/lib/commonmeta/writers/commonmeta_writer.rb +1 -1
data/resources/{commonmeta_v0.10.5.json → commonmeta_v0.10.7.json} +21 -5
data/resources/{datacite-v4.json → datacite-v45.json} +26 -5
data/resources/kernel-4/include/datacite-relationType-v4.xsd +2 -0
data/resources/kernel-4/include/datacite-resourceType-v4.xsd +2 -0
data/resources/kernel-4/metadata.xsd +11 -7
data/spec/author_utils_spec.rb +10 -0
data/spec/fixtures/commonmeta.json +1 -1
data/spec/fixtures/datacite-dataset_v4.5.json +736 -0
data/spec/fixtures/datacite-instrument.json +135 -0
data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/SoftwareSourceCode.yml +8 -8
data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/dissertation.yml +12 -12
data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/funding_references.yml +12 -12
data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/subject_scheme.yml +22 -22
data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/medium_post_with_institutional_author.yml +317 -0
data/spec/readers/commonmeta_reader_spec.rb +1 -1
data/spec/readers/datacite_reader_spec.rb +68 -14
data/spec/readers/json_feed_reader_spec.rb +25 -0
data/spec/utils_spec.rb +30 -4
data/spec/writers/commonmeta_writer_spec.rb +30 -3
data/spec/writers/csl_writer_spec.rb +1 -0
data/spec/writers/csv_writer_spec.rb +1 -0
data/spec/writers/datacite_writer_spec.rb +0 -1
metadata +7 -4

data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/medium_post_with_institutional_author.yml ADDED Viewed

@@ -0,0 +1,317 @@
+---
+http_interactions:
+- request:
+    method: get
+    uri: https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f
+    body:
+      encoding: ASCII-8BIT
+      string: ''
+    headers:
+      Connection:
+      - close
+      Host:
+      - api.rogue-scholar.org
+      User-Agent:
+      - http.rb/5.1.1
+  response:
+    status:
+      code: 200
+      message: OK
+    headers:
+      Content-Type:
+      - application/json
+      Content-Length:
+      - '23886'
+      Ratelimit-Limit:
+      - '15'
+      Ratelimit-Remaining:
+      - '14'
+      Ratelimit-Reset:
+      - '3'
+      Date:
+      - Wed, 31 Jan 2024 19:50:01 GMT
+      Server:
+      - Fly/ba9e227a (2024-01-26)
+      Via:
+      - 1.1 fly.io
+      Fly-Request-Id:
+      - 01HNGH4EZV3XQF20H1PZ6X5N07-fra
+    body:
+      encoding: UTF-8
+      string: '{"abstract":null,"archive_url":null,"authors":[{"name":"Research Graph"}],"blog":{"api":false,"archive_prefix":null,"authors":null,"backlog":0,"canonical_url":null,"category":"computerAndInformationSciences","created_at":1706685423,"current_feed_url":null,"description":"Stories
+        by Research Graph on Medium","favicon":"https://cdn-images-1.medium.com/fit/c/150/150/1*laJi0jBkVoGhXid7gD_DmQ.png","feed_format":"application/rss+xml","feed_url":"https://medium.com/@researchgraph/feed","filter":null,"funding":null,"generator":"Medium","generator_raw":"Medium","home_page_url":"https://medium.com/@researchgraph","id":"30da2ca9-8258-4ab5-acca-3919d9a5d98d","indexed":true,"issn":null,"language":"en","license":"https://creativecommons.org/licenses/by/4.0/legalcode","mastodon":"","plan":"Starter","prefix":"10.59350","relative_url":null,"ror":null,"secure":true,"slug":"researchgraph","status":"active","title":"Research
+        Graph","updated_at":1706151454,"use_api":null,"use_mastodon":false,"user_id":"a7e16958-1175-437c-b839-d4b8a47ec811","version":"https://jsonfeed.org/version/1.1"},"blog_name":"Research
+        Graph","blog_slug":"researchgraph","content_text":"**Tools and Platform for
+        Integration of Knowledge Graph with RAG\npipelines.**\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png\"\nalt=\"Complex
+        network connected to books and showing information from magespace\" />\n<figcaption>Image
+        Created in <a\nhref=\"https://www.mage.space/\">https://www.mage.space/</a></figcaption>\n</figure>\n\nAuthors:
+        [Aland\nAstudillo](https://www.linkedin.com/in/aland-astudillo/), [Aishwarya\nNambissan](https://www.linkedin.com/in/aishwarya-nambissan-127229200/)\n\nMany
+        users of chatbots such as ChatGPT, have encountered the problem of\nreceiving
+        inappropriate or incompatible responses. There are several\nreasons why this
+        might\u00a0happen.\n\nOne reason is the lack of appropriate training data,
+        as chatbots are\nusually trained on large amounts of text and code. If the
+        data is\ninsufficient or of poor quality, the chatbot may misunderstand queries\nand
+        provide inaccurate responses. Another reason is that some chatbots\nare designed
+        for specific tasks or domains, which limits their ability\nto handle broader
+        queries or understand subtle nuances in conversation.\nAdditionally, chatbots
+        may struggle with natural language, which is\ncomplex and often ambiguous.
+        This can cause them to misunderstand a\nuser''s query and provide irrelevant
+        or off-topic responses. Finally,\nthere are technical limitations, such as
+        the chatbot''s inability to\nreason or make inferences.\n\nThis article explores
+        a potential solution by combining two influential\napproaches in the field
+        of Natural Language Processing\u200a---\u200aRetrieval\nAugmented Generation
+        (**RAG**) and Knowledge Graphs(**KGs**). We will\ndelve into the partnership
+        between these two entities, discuss the\nnotable technologies and software
+        used in their processes, and highlight\nvarious options for utilizing their
+        combined potential.\n\n### **RAG**\n\nRetrieval-Augmented Generation is the
+        process of optimizing the output\nof a large language model using a knowledge
+        base outside of its training\ndata sources before generating a response. It
+        takes an input and\nretrieves a set of relevant/supporting documents given
+        a source (e.g.,\nWikipedia). This can be thought of as a Large Language Model
+        (LLM) not\njust putting words together, but carefully selecting relevant\ninformation
+        from external sources and Knowledge Graphs to create\nwell-informed and detailed
+        responses.\n\n### RAG Retrieval Techniques\n\nThe following are some crucial
+        technologies that enable RAG''s impressive\nability to retrieve and incorporate
+        relevant information:\n\n**Vector Search**: It transforms text into numerical
+        vectors, capturing\ntheir meaning and nuances in a mathematical space, creating
+        a map of\nrelationships. Similar texts, like those discussing shared topics
+        or\nusing similar language, end up positioned close together in this space,\nallowing
+        vector search to quickly identify them as related. This allows\nlightning-fast
+        comparisons, finding similar texts based on meaning, not\njust keywords.\n\nAlgorithms
+        like [**Faiss**](https://github.com/facebookresearch/faiss)\nand [**Annoy**](https://github.com/spotify/annoy)
+        map text into dense\nvectors, enabling fast comparisons and retrieval of relevant
+        passages\nbased on semantic similarity.\n\n**Passage Ranking**: It is an internal
+        algorithm that scores candidate\ntext passages based on their relevance to
+        a query. It considers factors\nlike keyword frequency, keyword overlap, and
+        document structure to act\nlike a judge, sifting through information to select
+        the most fitting and\ninformative passages.\n\nKeyword overlap measures how
+        often the same keywords appear in **both**\nthe query and the candidate passage,
+        emphasizing shared vocabulary and\npotential relevance. It differs from keyword
+        frequency, which simply\ncounts how often individual keywords appear within
+        a passage, regardless\nof their presence in the\u00a0query.\n\nTechniques
+        like [**BM25**](https://github.com/getalp/wikIR) and\n[**TF-IDF**](https://github.com/marcocor/wikipedia-idf)
+        score candidate\npassages based on keyword overlap and frequency, ensuring
+        retrieved\ninformation truly fits the\u00a0context.\n\n**Graph Neural Networks**
+        (**GNNs**): They are neural networks designed\nto explore and learn from interconnected
+        data like maps, social\nnetworks, and other complex relationships. Unlike
+        traditional processing\nmethods that go through data in a linear fashion,
+        GNNs are capable of\nrecognizing hidden patterns and understanding relationships
+        like \"who\nknows who\" and \"what connects to what\" by \"hopping\" across
+        connections\nin\u00a0data.\n\nConsider a graph as a network of dots(nodes)
+        connected by lines (edges).\nEach dot represents some information, like a
+        person, object, or concept.\nThe lines tell you how these things relate to
+        each\u00a0other.\n\nGNNs work in rounds. In each\u00a0round:\n\n1.  Message
+        Passing: Each node \"talks\" to its neighbors, sending\n    messages along
+        the edges. These messages contain information about\n    the node itself and
+        its features.\n2.  Node Update: Each node receives messages from all its neighbors
+        and\n    combines them with its own information. This update can involve\n    calculations
+        and applying a special function.\n3.  Output Calculation: Based on the updated
+        information, the network\n    calculates an output for each node. This output
+        could be a\n    prediction about the node''s category, its relationship to
+        another\n    node, or some other relevant information.\n\nThis process repeats
+        for multiple rounds, allowing nodes to incorporate\ninformation from their
+        entire neighborhood, not just their direct\nneighbors. As the rounds progress,
+        the network learns to understand the\nrelationships between nodes and the
+        overall structure of the\u00a0graph.\n\nWhen dealing with Knowledge Graphs,
+        frameworks like\n[**PyTorch-Geometric**](https://readthedocs.org/projects/pytorch-geometric/)\nand
+        [**DeepMind''s\nGNN**](https://github.com/deepmind/deepmind-research/blob/master/learning_to_simulate/graph_network.py)\nlibrary
+        come into play. These frameworks allow GNNs to traverse\ninterconnected entities
+        and relationships within the graph, retrieve\nrelevant knowledge fragments,
+        and understand complex connections.\n\n### **Knowledge Graphs: The Structured
+        Wisdom\u00a0Library**\n\nA knowledge graph, also referred to as a semantic
+        network, is a\nstructure that represents a network of real-world entities
+        such as\nobjects, events, situations, or concepts. It helps to illustrate
+        the\nconstantly changing representations of the world, connecting entities\n(such
+        as \"Marie Curie\") and relationships (such as \"won Nobel Prize\") to\nform
+        a complex network of information. This information is typically\nstored in
+        a graph database and visualized as a graph structure, thus the\nterm knowledge
+        \"graph\".\n\nKGs go beyond simply finding relevant facts and delve deeper
+        into\nunderstanding the relationships and insights hidden within using these\nprocesses:\n\n**Entity
+        Linking**: Imagine a vast network of information, like a big\npuzzle of dots.
+        Now imagine trying to connect specific names, places,\nand concepts to their
+        corresponding dots in the puzzle. That is what\nentity linking does with text
+        and knowledge graphs, connecting the\nspecific components of the text to the
+        corresponding nodes in the graph.\nThey help systems understand the exact
+        meaning of entities, and find\nrelevant information from the\u00a0graph.\n\nLibraries
+        like [**DGL-KeLP**](https://github.com/awslabs/dgl-ke)\nleverage GNNs to identify
+        and link named entities (like \"Marie Curie\")\nto their respective nodes
+        within the Knowledge Graphs, enabling RAG to\nretrieve information that is
+        directly relevant to the core subject of a\nsearch\u00a0query\n\n**Path Mining**:
+        Path mining is a process of uncovering hidden\nrelationships and patterns
+        that are not easily noticeable. It involves\nexploring complicated networks
+        of information and identifying and\ntracing connections between entities that
+        may seem unrelated. By doing\nso, path mining reveals surprising insights
+        and useful knowledge,\nimproving our understanding of the complex structures
+        within knowledge\ngraphs.\n\nTools like [**Neo4j**](https://neo4j.com/) and\n[**Stanza**](https://github.com/stanfordnlp/stanza)
+        allow traversing\npaths between entities, uncovering hidden relationships,
+        and generating\ninsightful responses based on this deeper understanding.\n\n**Reasoning
+        and Inference**: In the context of knowledge graphs,\nreasoning and inference
+        are not just limited to discovering facts; they\nare also concerned with utilizing
+        them effectively. This involves\nintegrating data, drawing meaningful connections,
+        and using logical\nreasoning to resolve issues, foresee future occurrences,
+        or even\nconstruct narratives leveraging the insights provided by the knowledge\ngraph.\n\nConsider
+        the scenario of trying to find an organization that works in\nspecific sectors
+        with the help of a knowledge graph. This analogy\neffectively highlights the
+        active role of reasoning and inference in\nknowledge graphs:\n\n1.  Gathering
+        Facts: Knowledge graphs collect and organize information\n    from various
+        sources, such as websites, databases, academic papers,\n    and social media
+        platforms. These facts are represented as\n    structured data, with entities
+        (e.g., organizations) and their\n    attributes (e.g., sectors in which they
+        operate) forming nodes and\n    edges in the graph. By combining data about
+        organizations and\n    sectors, knowledge graphs enable the gathering of relevant
+        facts for\n    analysis.\n2.  Integrating information: By connecting an organization''s\n    relationships
+        with specific sectors, such as partnerships,\n    investments, or certifications,
+        knowledge graphs reveal the scope\n    and relevance of their work within
+        those sectors. Links to related\n    entities like employees, board members,
+        or projects can further\n    contribute to understanding an organization''s
+        involvement in\n    specific\u00a0sectors.\n3.  Predicting and Creating: Knowledge
+        graphs can leverage machine\n    learning and predictive models to infer missing
+        or hidden\n    information. By analyzing the available facts and connections
+        within\n    the graph, these models can predict an organization''s potential\n    involvement
+        in sectors that have common attributes with their known\n    areas of operation.
+        For example, if an organization has expertise in\n    renewable energy, predictive
+        models could suggest their likely\n    involvement in related sectors like
+        clean transportation or\n    sustainable infrastructure. Additionally, knowledge
+        graphs\n    facilitate the creation of new information and insights by combining\n    existing
+        facts with external data sources. For instance, by\n    integrating real-time
+        data on industry trends, market analysis, or\n    news articles, knowledge
+        graphs enable the discovery of emerging\n    sectors or upcoming organizations
+        that might align with the given\n    parameters.\n\nA framework like [**Atomspace**](https://github.com/opencog/atomspace)\nfrom
+        [**OpenCog**](https://opencog.org/) empowers RAG to reason and\ninfer new
+        knowledge. By traversing paths and combining information from\ninterconnected
+        entities, the system can generate informed predictions or\nanswer hypothetical
+        questions.\n\n### Purpose\n\nThe combination of Retrieval-Augmented Generation
+        (RAG) and Knowledge\nGraphs (KG) is beneficial for several\u00a0reasons:\n\n1.  **Enhanced
+        information retrieval**: Knowledge graphs provide\n    structured and interconnected
+        information that can significantly\n    improve the effectiveness of information
+        retrieval. By using KGs,\n    RAG models can retrieve more accurate and relevant
+        information,\n    leading to better generation and response\u00a0quality.\n2.  **Reliable
+        and diverse information:** KGs are constructed from\n    authoritative sources,
+        making them reliable and trustworthy sources\n    of information. RAG models
+        can leverage this reliable information to\n    generate more accurate responses.
+        Additionally, KGs help in\n    diversifying the generated responses by providing
+        a broader pool of\n    related facts and entities.\n3.  **Context-aware understanding**:
+        KGs enable RAG models to understand\n    and reason over the contextual information.
+        By leveraging the\n    relationships and semantic connections encoded in KGs,
+        RAG models\n    can better grasp the context of user queries or conversations,\n    resulting
+        in more coherent and appropriate responses.\n4.  **Handling complex queries**:
+        KGs allow RAG models to tackle complex\n    queries by breaking them down
+        into smaller sub-queries, retrieving\n    relevant pieces of information from
+        the KG, and then generating a\n    response based on the retrieved knowledge.
+        This enables RAG models\n    to handle a wide range of user queries effectively.\n5.  **Explainability
+        and transparency**: KGs provide a transparent and\n    interpretable representation
+        of knowledge. By integrating KG-based\n    retrieval into RAG models, the
+        reasoning behind the generated\n    responses becomes more explainable. Users
+        can have a clear\n    understanding of the knowledge sources and connections
+        used to\n    produce the response.\n6.  **Scalability**: Knowledge graphs
+        act as large-scale repositories of\n    information. RAG models can leverage
+        KGs to generate responses to\n    various queries or conversations without
+        requiring additional\n    supervised training data. This makes the RAG+KG
+        approach scalable to\n    handle an extensive range of knowledge domains and
+        user\u00a0queries.\n\n### **Pipeline Possibilities: Orchestrating RAG and\u00a0KGs:**\n\nLet''s
+        explore some exciting pipeline options for harnessing the combined\npower
+        of RAG and Knowledge Graphs. There are two options in which either\nthe LLM
+        is prioritized or the Knowledge Graph is prioritized:\n\n**Option 1: LLM-Centric
+        Pipeline:**\n\nThe LLM-Centric pipeline is a RAG and Knowledge Graph combination
+        that\nempowers LLMs to craft well-informed responses. Here''s how it\u00a0works:\n\n1.  Start
+        with the user''s question or statement\n2.  The LLM (like GPT-3) generates
+        an initial draft response based on\n    its internal knowledge. This draft
+        may lack specific factual details\n    or nuances that a knowledge graph can\u00a0provide.\n3.  RAG
+        kicks in, searching the text corpus or the Knowledge Graph for\n    relevant
+        passages that enrich the draft. During the retrieval\n    process, RAG retrieval
+        techniques are used to search not only text\n    corpora but also knowledge
+        graphs to find relevant information. This\n    means that RAG can directly
+        tap into the structured knowledge within\n    the graph to retrieve facts,
+        relationships, and entities that align\n    with the user''s query and the
+        LLM''s generated draft.\n4.  The retrieved information is carefully fused
+        with the LLM''s output,\n    creating a more factually accurate and insightful
+        response\n5.  A final polishing step ensures the response is fluent, grammatically\n    correct,
+        and ready to\u00a0show.\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*3pd9MOIflkbS07wI\"
+        />\n<figcaption>RAG LLM-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
+        basic steps to perform this\u00a0are:\n\n1.  **Pre-processing**: Clean and
+        tokenize user input to prepare for\n    processing.\n2.  **LLM Generation**:
+        Generate an initial draft response using an LLM\n    like [**GPT-3**](https://openai.com/product)
+        or [**Jurassic-1\n    Jumbo**](https://www.livescience.com/google-sentient-ai-lamda-lemoine).\n3.  **Retrieval**:
+        Employ RAG techniques to retrieve relevant passages\n    from a text corpus
+        or Knowledge Graphs.\n4.  **Fusion**: Integrate retrieved information into
+        the LLM-generated\n    draft, creating a more informed and factually-grounded
+        response.\n5.  **Post-processing**: Refine the final response for fluency,\n    grammatical
+        correctness, and overall coherence.\n\n**Option 2: Knowledge Graphs-Centric
+        Pipeline:**\n\nIn this approach, knowledge graphs take center stage. In essence,
+        this\npipeline prioritizes the structured knowledge within knowledge graphs,\nusing
+        RAG retrieval techniques to translate those insights into\ncompelling and
+        informative language. Here''s how it\u00a0unfolds:\n\n1.  User input: The
+        process begins with the user''s question or statement\n2.  Graph exploration:
+        The knowledge graph is meticulously explored to\n    identify relevant entities,
+        relationships, and paths that align with\n    the user''s input. This stage
+        involves techniques like entity\n    linking, path mining, and reasoning to
+        uncover valuable information\n    within the\u00a0graph\n3.  Response planning:
+        The insights extracted from the graph are used to\n    create a structured
+        response plan. This plan outlines the key\n    points, facts, and logical
+        flow that the final response\n    should\u00a0embody\n4.  Language generation:
+        This is where RAG steps in. Its purpose is to\n    create human-like text
+        that follows the response plan. It uses LLMs\n    to produce well-written
+        sentences and paragraphs, combining the\n    relevant information from the
+        knowledge graph while maintaining\n    cohesiveness and readability.\n5.  Post-processing:
+        The generated response undergoes a final refinement\n    process to ensure
+        grammatical correctness, clarity, and\n    overall\u00a0quality\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*mZ83esKBjbPmCq_C\"
+        />\n<figcaption>RAG Knowledge Graph-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
+        basic steps\u00a0are:\n\n1.  **Query Formulation**: Transform the user input
+        into a query\n    suitable for Knowledge Graph''s exploration.\n2.  **Knowledge
+        Graphs:** You can use either Neo4j or\n    [NebulaGraph](https://www.nebula-graph.io/)
+        to implement a retrieval\n    enhancement technique. This technique involves
+        utilizing a knowledge\n    graph to illustrate the connections between entities
+        and\n    relationships. Additionally, it incorporates a powerful language\n    model
+        to improve the retrieval process.\n3.  **Fact Selection**: Employ entity linking
+        and reasoning algorithms\n    to select and prioritize the most relevant facts
+        based on the query\n    and\u00a0context.\n4.  **Natural Language Generation**
+        (**NLG**): Utilise specialized NLG\n    models like\n    [BART](https://research.facebook.com/publications/controllable-abstractive-summarization/)\n    to
+        translate the extracted facts into a natural language response.\n5.  **Refinement**:
+        Enhance the generated response for clarity and\n    coherence.\n\n### **Unveiling
+        a Future of Intelligent Interaction**\n\nThe combination of RAG and Knowledge
+        Graphs goes beyond just being a\ntechnological fusion. It paves the way for
+        a future where the\ninteraction between humans and computers goes beyond simple
+        words and\nbecomes a more informed and refined form of communication. As these\ntechnologies
+        continue to develop, we can expect to witness a significant\ntransformation
+        in:\n\n- AI-powered assistants that answer your questions with the confidence\n  of
+        a well-read friend, seamlessly combining relevant facts and\n  insights gleaned
+        from Knowledge Graphs.\n- Next-generation search engines that go beyond keyword
+        matching,\n  understanding the deeper meaning behind your queries and delivering\n  comprehensive,
+        contextual results enriched with information from\n  Knowledge Graphs.\n-
+        Creative writing tools that utilize RAG and Knowledge Graphs to\n  generate
+        stories that are both factually accurate and full of\n  unexpected plot twists
+        and character development, moving beyond\n  clich\u00e9d patterns.\n\n###
+        **Conclusion**\n\nThe convergence of Retrieval Augmented Generation (RAG)
+        and Knowledge\nGraphs (KGs) brings about an exciting synergy in the world
+        of Natural\nLanguage Processing (NLP). RAG enhances the output of large language\nmodels
+        by carefully selecting relevant information from external sources\nand KGs,
+        allowing for well-informed and detailed responses. KGs, on the\nother hand,
+        provide a structured representation of real-world entities\nand their relationships,
+        enabling the exploration of hidden insights and\nthe discovery of complex
+        connections.\n\nThe integration of RAG and KGs opens up two pipeline possibilities.
+        The\nLLM-centric pipeline prioritizes the language model''s output, which
+        is\nthen enriched with information retrieved from KGs. The Knowledge\nGraphs-centric
+        pipeline, on the other hand, places KGs at the center,\nutilizing RAG techniques
+        to translate the structured insights into\ncompelling and informative language.\n\nWhile
+        integrating LLMs and a knowledge graph for content retrieval\nrequires careful
+        planning, the reward is significant. You can gain\naccess to hidden relationships
+        within information, ultimately leading to\nhigher-quality output information.\n\nTools
+        like **OpenAI**, **Langchain**, and **LlamaIndex** provide\nready-made pipelines
+        to integrate knowledge graphs (like **Neo4j**)\neasily. Meanwhile, open-source
+        LLMs like **Mistral**, **Llama**, and\n**Dolphin** are catching up to proprietary
+        models in performance, making\nthem attractive choices for building custom
+        architectures. This\nopen-source scenario allows for the exploration and examination
+        of\nvarious methods before fully committing to a particular technological\nframework.
+        So, it is crucial to evaluate your needs and choose the\napproach that best
+        fits your use\u00a0case.\n\n![](https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=fc0a6900f7eb){width=\"1\"\nheight=\"1\"}\n","doi":"https://doi.org/10.59350/jhrs4-22440","guid":"https://medium.com/p/fc0a6900f7eb","id":"05f01f68-ef81-47d7-a3c1-40aba91d358f","image":"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png","indexed_at":1706690571,"language":"en","published_at":1705557796,"reference":[],"relationships":[],"summary":"<strong>\n
+        Tools and Platform for Integration of Knowledge Graph with RAG pipelines.\n</strong>\nAuthors:
+        Aland Astudillo, Aishwarya Nambissan Many users of chatbots such as ChatGPT,
+        have encountered the problem of receiving inappropriate or incompatible responses.
+        There are several reasons why this might\u00a0happen. One reason is the lack
+        of appropriate training data, as chatbots are usually trained on large amounts
+        of text and code.","tags":["Artificial-intelligence","Machine-learning","Retrieval-augmented","Knowledge-graph"],"title":"Unveiling
+        the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs","updated_at":1705557796,"url":"https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb"}
+        '
+  recorded_at: Wed, 31 Jan 2024 19:50:01 GMT
+recorded_with: VCR 6.2.0

data/spec/readers/commonmeta_reader_spec.rb CHANGED Viewed

@@ -10,7 +10,7 @@ describe Commonmeta::Metadata, vcr: true do
   context 'read commonmeta metadata' do
     it "default" do
       expect(subject.valid?).to be true
-      expect(subject.schema_version).to eq("https://commonmeta.org/commonmeta_v0.10.5.json")
+      expect(subject.schema_version).to eq("https://commonmeta.org/commonmeta_v0.10")
       expect(subject.id).to eq("https://doi.org/10.7554/elife.01567")
       expect(subject.type).to eq("JournalArticle")
       expect(subject.url).to eq("https://elifesciences.org/articles/01567")

data/spec/readers/datacite_reader_spec.rb CHANGED Viewed

@@ -36,13 +36,12 @@ describe Commonmeta::Metadata, vcr: true do
       # expect(subject.valid?).to be true
       expect(subject.id).to eq("https://doi.org/10.5063/f1m61h5x")
       expect(subject.type).to eq("Software")
-      expect(subject.contributors).to eq([{"contributorRoles"=>["Author"],
-        "name"=>
-        "Jones, Matthew B.; Slaughter, Peter; Nahf, Rob; Boettiger, Carl ; Jones, Chris; Read, Jordan; Walker, Lauren; Hart, Edmund; Chamberlain, Scott",
-        "type"=>"Organization"}])
-      expect(subject.titles).to eq([{"title"=>"dataone: R interface to the DataONE network of data repositories"}])
-      expect(subject.date).to eq("created"=>"2016-03-12", "published"=>"2016", "registered"=>"2016-03-12", "updated"=>"2020-09-18")
-      expect(subject.publisher).to eq("name"=>"KNB Data Repository")
+      expect(subject.contributors).to eq([{ "contributorRoles" => ["Author"],
+                                            "name" => "Jones, Matthew B.; Slaughter, Peter; Nahf, Rob; Boettiger, Carl ; Jones, Chris; Read, Jordan; Walker, Lauren; Hart, Edmund; Chamberlain, Scott",
+                                            "type" => "Organization" }])
+      expect(subject.titles).to eq([{ "title" => "dataone: R interface to the DataONE network of data repositories" }])
+      expect(subject.date).to eq("created" => "2016-03-12", "published" => "2016", "registered" => "2016-03-12", "updated" => "2020-09-18")
+      expect(subject.publisher).to eq("name" => "KNB Data Repository")
       expect(subject.provider).to eq("DataCite")
     end
@@ -70,8 +69,8 @@ describe Commonmeta::Metadata, vcr: true do
       expect(subject.type).to eq("Dissertation")
       expect(subject.contributors.length).to eq(3)
       expect(subject.contributors.first).to eq("type" => "Person", "contributorRoles" => ["Author"],
-                                            "givenName" => "Heiko", "familyName" => "Conrad")
-      expect(subject.contributors.last).to eq("id"=>"https://orcid.org/0000-0002-8633-8234", "type"=>"Person", "contributorRoles"=>["Supervision"], "givenName"=>"Gerhard", "familyName"=>"Gruebel", "affiliation"=>[{"name"=>"Deutsches Elektronen-Synchrotron"}])
+                                               "givenName" => "Heiko", "familyName" => "Conrad")
+      expect(subject.contributors.last).to eq("id" => "https://orcid.org/0000-0002-8633-8234", "type" => "Person", "contributorRoles" => ["Supervision"], "givenName" => "Gerhard", "familyName" => "Gruebel", "affiliation" => [{ "name" => "Deutsches Elektronen-Synchrotron" }])
       expect(subject.titles).to eq([{ "title" => "Dynamics of colloids in molecular glass forming liquids studied via X-ray photon correlation spectroscopy" }])
       expect(subject.date).to eq("created" => "2018-01-25", "published" => "2014",
                                  "registered" => "2018-01-25", "updated" => "2020-09-19")
@@ -91,7 +90,7 @@ describe Commonmeta::Metadata, vcr: true do
         "affiliation" => [{ "name" => "Тверская государственная сельскохозяйственная академия" }], "familyName" => "Ганичева", "givenName" => "А.В.", "type" => "Person", "contributorRoles" => ["Author"],
       )
       expect(subject.titles.last).to eq("title" => "MODEL OF SYSTEM DYNAMICS OF PROCESS OF TRAINING",
-                                        "titleType" => "TranslatedTitle")
+                                        "type" => "TranslatedTitle")
       expect(subject.date).to eq("created" => "2019-02-12", "published" => "2019",
                                  "registered" => "2019-02-12", "updated" => "2022-08-23")
       expect(subject.publisher).to eq("name" => "МОДЕЛИРОВАНИЕ, ОПТИМИЗАЦИЯ И ИНФОРМАЦИОННЫЕ ТЕХНОЛОГИИ")
@@ -115,10 +114,14 @@ describe Commonmeta::Metadata, vcr: true do
       expect(subject.contributors.first).to eq(
         "name" => "Europäische Kommission", "contributorRoles" => ["Author"], "type" => "Organization",
       )
-      expect(subject.titles).to eq([
-                                     { "lang" => "de",
-                                       "title" => "Flash Eurobarometer 54 (Madrid Summit)" }, { "lang" => "en", "title" => "Flash Eurobarometer 54 (Madrid Summit)" }, { "titleType" => "Subtitle", "lang" => "de", "title" => "The Common European Currency" }, { "titleType" => "Subtitle", "lang" => "en", "title" => "The Common European Currency" },
-                                   ])
+      expect(subject.titles).to eq([{ "language" => "de", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
+                                    { "language" => "en", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
+                                    { "language" => "de",
+                                      "title" => "The Common European Currency",
+                                      "type" => "Subtitle" },
+                                    { "language" => "en",
+                                      "title" => "The Common European Currency",
+                                      "type" => "Subtitle" }])
       expect(subject.subjects).to eq([{ "lang" => "en",
                                         "subject" => "KAT12 International Institutions, Relations, Conditions",
                                         "subjectScheme" => "ZA" },
@@ -155,5 +158,56 @@ describe Commonmeta::Metadata, vcr: true do
       expect(subject.license).to eq("id" => "CC-BY-4.0",
                                     "url" => "https://creativecommons.org/licenses/by/4.0/legalcode")
     end
+    it "dataset schema v4.5" do
+      input = "#{fixture_path}datacite-dataset_v4.5.json"
+      subject = described_class.new(input: input)
+      expect(subject.id).to eq("https://doi.org/10.82433/b09z-4k37")
+      expect(subject.type).to eq("Dataset")
+      expect(subject.contributors.length).to eq(23)
+      expect(subject.contributors[0]).to eq("contributorRoles" => ["Author"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
+      expect(subject.contributors[2]).to eq("contributorRoles" => ["ContactPerson"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
+      expect(subject.date).to eq("created" => "2022-10-27", "published" => "2022", "registered" => "2022-10-27", "updated" => "2024-01-02")
+      expect(subject.publisher).to eq("name" => "Example Publisher")
+      expect(subject.titles).to eq([{ "language" => "en", "title" => "Example Title" },
+                                    { "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
+                                    { "language" => "fr",
+                                      "title" => "Example TranslatedTitle",
+                                      "type" => "TranslatedTitle" },
+                                    { "language" => "en",
+                                      "title" => "Example AlternativeTitle",
+                                      "type" => "AlternativeTitle" }])
+      expect(subject.descriptions).to eq([{ "description" => "Example Abstract",
+                                            "type" => "Abstract",
+                                            "language" => "en" },
+                                          { "description" => "Example Methods",
+                                            "type" => "Methods",
+                                            "language" => "en" },
+                                          { "description" => "Example SeriesInformation",
+                                            "type" => "Other",
+                                            "language" => "en" },
+                                          { "description" => "Example TableOfContents",
+                                            "type" => "Other",
+                                            "language" => "en" },
+                                          { "description" => "Example TechnicalInfo",
+                                            "type" => "TechnicalInfo",
+                                            "language" => "en" },
+                                          { "description" => "Example Other", "type" => "Other", "language" => "en" }])
+      expect(subject.license).to eq("id" => "CC-PDDC", "url" => "https://creativecommons.org/licenses/publicdomain/")
+    end
+    it "instrument" do
+      input = "#{fixture_path}datacite-instrument.json"
+      subject = described_class.new(input: input)
+      puts subject.errors unless subject.valid?
+      expect(subject.valid?).to be true
+      expect(subject.id).to eq("https://doi.org/10.82433/08qf-ee96")
+      expect(subject.type).to eq("Instrument")
+      expect(subject.contributors.length).to eq(2)
+      expect(subject.contributors.first).to eq("contributorRoles" => ["Author"], "name" => "DECTRIS", "type" => "Organization", "id" => "https://www.wikidata.org/wiki/Q107529885")
+      expect(subject.date).to eq("created" => "2022-10-20", "published" => "2022", "registered" => "2022-10-20", "updated" => "2024-01-02")
+      expect(subject.publisher).to eq("name" => "Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences")
+      expect(subject.license).to be_nil
+    end
   end
 end

data/spec/readers/json_feed_reader_spec.rb CHANGED Viewed

@@ -189,6 +189,31 @@ describe Commonmeta::Metadata, vcr: true do
       expect(subject.references).to be_nil
     end
+    it "medium post with institutional author" do
+      input = "https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f"
+      subject = described_class.new(input: input)
+      # expect(subject.valid?).to be true
+      expect(subject.id).to eq("https://doi.org/10.59350/jhrs4-22440")
+      expect(subject.url).to eq("https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb")
+      expect(subject.alternate_identifiers).to eq([{ "alternateIdentifier" => "05f01f68-ef81-47d7-a3c1-40aba91d358f", "alternateIdentifierType" => "UUID" }])
+      expect(subject.type).to eq("Article")
+      expect(subject.contributors.length).to eq(1)
+      expect(subject.contributors.first).to eq("contributorRoles"=>["Author"], "name"=>"Research Graph", "type"=>"Organization")
+      expect(subject.titles).to eq([{ "title" => "Unveiling the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs" }])
+      expect(subject.license).to eq("id" => "CC-BY-4.0",
+                                    "url" => "https://creativecommons.org/licenses/by/4.0/legalcode")
+      expect(subject.date).to eq("published"=>"2024-01-18", "updated"=>"2024-01-18")
+      expect(subject.descriptions.first["description"]).to start_with("<strong> Tools and Platform for Integration of Knowledge Graph with RAG pipelines.")
+      expect(subject.publisher).to eq("name" => "Research Graph")
+      expect(subject.subjects).to eq([{ "subject" => "Computer and information sciences" },
+                                      { "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf",
+                                        "subject" => "FOS: Computer and information sciences",
+                                        "subjectScheme" => "Fields of Science and Technology (FOS)" }])
+      expect(subject.language).to eq("en")
+      expect(subject.container).to eq("identifier" => "https://medium.com/@researchgraph", "identifierType" => "URL", "title" => "Research Graph", "type" => "Periodical")
+      expect(subject.references).to be_nil
+    end
     it "syldavia gazette post with references" do
       input = "https://api.rogue-scholar.org/posts/0022b9ef-525a-4a79-81ad-13411697f58a"
       subject = described_class.new(input: input)

data/spec/utils_spec.rb CHANGED Viewed

@@ -502,7 +502,7 @@ describe Commonmeta::Metadata do
     links = [{ "rel" => "self", "type" => "application/atom+xml", "href" => "https://syldavia-gazette.org/atom/" },
              { "rel" => "alternate", "type" => "text/html", "href" => "https://syldavia-gazette.org" },
              { "rel" => "license", "type" => "text/html", "href" => "https://creativecommons.org/licenses/by/4.0/legalcode" }]
     it "url" do
       response = subject.get_link(links, "self")
       expect(response).to eq("https://syldavia-gazette.org/atom/")
@@ -721,7 +721,7 @@ describe Commonmeta::Metadata do
     it "decode doi to uuid" do
       doi = "https://doi.org/10.53731/6315bn4-aqg82ja-4a9wxdt-29f7279"
       response = subject.decode_doi(doi, uuid: true)
-      expect(response).to eq('255d48ab-c102-9288-a4f3-add092f388e9')
+      expect(response).to eq("255d48ab-c102-9288-a4f3-add092f388e9")
     end
   end
@@ -745,10 +745,36 @@ describe Commonmeta::Metadata do
     end
   end
-  context 'json_feed_unregistered_url' do
-    it 'all posts' do
+  context "json_feed_unregistered_url" do
+    it "all posts" do
       response = subject.json_feed_unregistered_url
       expect(response).to eq("https://api.rogue-scholar.org/posts/unregistered")
     end
   end
+  context "normalize_name_identifier" do
+    it "ORCID" do
+      hsh = {"schemeUri"=>"https://orcid.org", "nameIdentifier"=>"https://orcid.org/0000-0003-1419-2405", "nameIdentifierScheme"=>"ORCID"}
+      response = subject.normalize_name_identifier(hsh)
+      expect(response).to eq("https://orcid.org/0000-0003-1419-2405")
+    end
+    it "ROR" do
+      hsh = { "schemeUri" => "https://ror.org", "nameIdentifier" => "https://ror.org/02aj13c28", "nameIdentifierScheme" => "ROR" }
+      response = subject.normalize_name_identifier(hsh)
+      expect(response).to eq("https://ror.org/02aj13c28")
+    end
+    it "ISNI" do
+      hsh = { "schemeUri" => "http://isni.org/isni/", "nameIdentifier" => "0000000134596520", "nameIdentifierScheme" => "ISNI" }
+      response = subject.normalize_name_identifier(hsh)
+      expect(response).to eq("https://isni.org/isni/0000000134596520")
+    end
+    it "Wikidata" do
+      hsh = {"schemeUri"=>"https://www.wikidata.org/wiki/", "nameIdentifier"=>"Q107529885", "nameIdentifierScheme"=>"Wikidata"}
+      response = subject.normalize_name_identifier(hsh)
+      expect(response).to eq("https://www.wikidata.org/wiki/Q107529885")
+    end
+  end
 end

data/spec/writers/commonmeta_writer_spec.rb CHANGED Viewed

@@ -33,11 +33,38 @@ describe Commonmeta::Metadata, vcr: true do
                                              "volume" => "426",
                                              "firstPage" => "181",
                                              "containerTitle" => "Nature")
-      expect(json["date"]).to eq("published"=>"2014-02-11", "updated"=>"2022-03-26")
+      expect(json["date"]).to eq("published" => "2014-02-11", "updated" => "2022-03-26")
       expect(json["descriptions"].first["description"]).to start_with("Among various advantages,")
-      expect(json["license"]).to eq("id"=>"CC-BY-3.0", "url"=>"https://creativecommons.org/licenses/by/3.0/legalcode")
+      expect(json["license"]).to eq("id" => "CC-BY-3.0", "url" => "https://creativecommons.org/licenses/by/3.0/legalcode")
       expect(json["provider"]).to eq("Crossref")
-      expect(json["files"].first).to eq("mimeType"=>"application/pdf", "url"=>"https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
+      expect(json["files"].first).to eq("mimeType" => "application/pdf", "url" => "https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
+    end
+    it "dataset schema v4.5" do
+      input = "#{fixture_path}datacite-dataset_v4.5.json"
+      subject = described_class.new(input: input)
+      expect(subject.id).to eq("https://doi.org/10.82433/b09z-4k37")
+      json = JSON.parse(subject.commonmeta)
+      expect(json["id"]).to eq("https://doi.org/10.82433/b09z-4k37")
+      expect(json["type"]).to eq("Dataset")
+      expect(json["titles"]).to eq([{ "language" => "en", "title" => "Example Title" },
+                                    { "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
+                                    { "language" => "fr",
+                                      "title" => "Example TranslatedTitle",
+                                      "type" => "TranslatedTitle" },
+                                    { "language" => "en",
+                                      "title" => "Example AlternativeTitle",
+                                      "type" => "AlternativeTitle" }])
+      expect(json["descriptions"]).to eq([{ "description" => "Example Abstract", "language" => "en", "type" => "Abstract" },
+                                          { "description" => "Example Methods", "language" => "en", "type" => "Methods" },
+                                          { "description" => "Example SeriesInformation",
+                                            "language" => "en",
+                                            "type" => "Other" },
+                                          { "description" => "Example TableOfContents", "language" => "en", "type" => "Other" },
+                                          { "description" => "Example TechnicalInfo",
+                                            "language" => "en",
+                                            "type" => "TechnicalInfo" },
+                                          { "description" => "Example Other", "language" => "en", "type" => "Other" }])
     end
   end
 end

data/spec/writers/csl_writer_spec.rb CHANGED Viewed

@@ -7,6 +7,7 @@ describe Commonmeta::Metadata, vcr: true do
     it 'Dataset' do
       input = 'https://doi.org/10.5061/DRYAD.8515'
       subject = described_class.new(input: input, from: 'datacite')
+      puts subject.errors unless subject.valid?
       expect(subject.valid?).to be true
       json = JSON.parse(subject.csl)
       expect(json['type']).to eq('dataset')

data/spec/writers/csv_writer_spec.rb CHANGED Viewed

@@ -37,6 +37,7 @@ describe Commonmeta::Metadata, vcr: true do
     it 'text' do
       input = 'https://doi.org/10.3204/desy-2014-01645'
       subject = described_class.new(input: input, from: 'datacite')
+      puts subject.errors unless subject.valid?
       expect(subject.valid?).to be true
       csv = subject.csv.parse_csv

data/spec/writers/datacite_writer_spec.rb CHANGED Viewed

@@ -136,7 +136,6 @@ describe Commonmeta::Metadata, vcr: true do
     it 'from schema.org' do
       input = 'https://blog.front-matter.io/posts/eating-your-own-dog-food/'
       subject = described_class.new(input: input, from: 'schema_org')
-      puts subject.errors
       expect(subject.valid?).to be true
       datacite = JSON.parse(subject.datacite)
       expect(datacite.fetch('titles')).to eq([{ 'title' => 'Eating your own Dog Food' }])