commonmeta-ruby 3.9.0 → 3.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +12 -11
  3. data/lib/commonmeta/author_utils.rb +12 -5
  4. data/lib/commonmeta/readers/commonmeta_reader.rb +1 -1
  5. data/lib/commonmeta/readers/datacite_reader.rb +120 -108
  6. data/lib/commonmeta/schema_utils.rb +1 -1
  7. data/lib/commonmeta/utils.rb +47 -2
  8. data/lib/commonmeta/version.rb +1 -1
  9. data/lib/commonmeta/writers/commonmeta_writer.rb +1 -1
  10. data/resources/{commonmeta_v0.10.5.json → commonmeta_v0.10.7.json} +21 -5
  11. data/resources/{datacite-v4.json → datacite-v45.json} +26 -5
  12. data/resources/kernel-4/include/datacite-relationType-v4.xsd +2 -0
  13. data/resources/kernel-4/include/datacite-resourceType-v4.xsd +2 -0
  14. data/resources/kernel-4/metadata.xsd +11 -7
  15. data/spec/author_utils_spec.rb +10 -0
  16. data/spec/fixtures/commonmeta.json +1 -1
  17. data/spec/fixtures/datacite-dataset_v4.5.json +736 -0
  18. data/spec/fixtures/datacite-instrument.json +135 -0
  19. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/SoftwareSourceCode.yml +8 -8
  20. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/dissertation.yml +12 -12
  21. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/funding_references.yml +12 -12
  22. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/subject_scheme.yml +22 -22
  23. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/medium_post_with_institutional_author.yml +317 -0
  24. data/spec/readers/commonmeta_reader_spec.rb +1 -1
  25. data/spec/readers/datacite_reader_spec.rb +68 -14
  26. data/spec/readers/json_feed_reader_spec.rb +25 -0
  27. data/spec/utils_spec.rb +30 -4
  28. data/spec/writers/commonmeta_writer_spec.rb +30 -3
  29. data/spec/writers/csl_writer_spec.rb +1 -0
  30. data/spec/writers/csv_writer_spec.rb +1 -0
  31. data/spec/writers/datacite_writer_spec.rb +0 -1
  32. metadata +7 -4
@@ -0,0 +1,317 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f
6
+ body:
7
+ encoding: ASCII-8BIT
8
+ string: ''
9
+ headers:
10
+ Connection:
11
+ - close
12
+ Host:
13
+ - api.rogue-scholar.org
14
+ User-Agent:
15
+ - http.rb/5.1.1
16
+ response:
17
+ status:
18
+ code: 200
19
+ message: OK
20
+ headers:
21
+ Content-Type:
22
+ - application/json
23
+ Content-Length:
24
+ - '23886'
25
+ Ratelimit-Limit:
26
+ - '15'
27
+ Ratelimit-Remaining:
28
+ - '14'
29
+ Ratelimit-Reset:
30
+ - '3'
31
+ Date:
32
+ - Wed, 31 Jan 2024 19:50:01 GMT
33
+ Server:
34
+ - Fly/ba9e227a (2024-01-26)
35
+ Via:
36
+ - 1.1 fly.io
37
+ Fly-Request-Id:
38
+ - 01HNGH4EZV3XQF20H1PZ6X5N07-fra
39
+ body:
40
+ encoding: UTF-8
41
+ string: '{"abstract":null,"archive_url":null,"authors":[{"name":"Research Graph"}],"blog":{"api":false,"archive_prefix":null,"authors":null,"backlog":0,"canonical_url":null,"category":"computerAndInformationSciences","created_at":1706685423,"current_feed_url":null,"description":"Stories
42
+ by Research Graph on Medium","favicon":"https://cdn-images-1.medium.com/fit/c/150/150/1*laJi0jBkVoGhXid7gD_DmQ.png","feed_format":"application/rss+xml","feed_url":"https://medium.com/@researchgraph/feed","filter":null,"funding":null,"generator":"Medium","generator_raw":"Medium","home_page_url":"https://medium.com/@researchgraph","id":"30da2ca9-8258-4ab5-acca-3919d9a5d98d","indexed":true,"issn":null,"language":"en","license":"https://creativecommons.org/licenses/by/4.0/legalcode","mastodon":"","plan":"Starter","prefix":"10.59350","relative_url":null,"ror":null,"secure":true,"slug":"researchgraph","status":"active","title":"Research
43
+ Graph","updated_at":1706151454,"use_api":null,"use_mastodon":false,"user_id":"a7e16958-1175-437c-b839-d4b8a47ec811","version":"https://jsonfeed.org/version/1.1"},"blog_name":"Research
44
+ Graph","blog_slug":"researchgraph","content_text":"**Tools and Platform for
45
+ Integration of Knowledge Graph with RAG\npipelines.**\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png\"\nalt=\"Complex
46
+ network connected to books and showing information from magespace\" />\n<figcaption>Image
47
+ Created in <a\nhref=\"https://www.mage.space/\">https://www.mage.space/</a></figcaption>\n</figure>\n\nAuthors:
48
+ [Aland\nAstudillo](https://www.linkedin.com/in/aland-astudillo/), [Aishwarya\nNambissan](https://www.linkedin.com/in/aishwarya-nambissan-127229200/)\n\nMany
49
+ users of chatbots such as ChatGPT, have encountered the problem of\nreceiving
50
+ inappropriate or incompatible responses. There are several\nreasons why this
51
+ might\u00a0happen.\n\nOne reason is the lack of appropriate training data,
52
+ as chatbots are\nusually trained on large amounts of text and code. If the
53
+ data is\ninsufficient or of poor quality, the chatbot may misunderstand queries\nand
54
+ provide inaccurate responses. Another reason is that some chatbots\nare designed
55
+ for specific tasks or domains, which limits their ability\nto handle broader
56
+ queries or understand subtle nuances in conversation.\nAdditionally, chatbots
57
+ may struggle with natural language, which is\ncomplex and often ambiguous.
58
+ This can cause them to misunderstand a\nuser''s query and provide irrelevant
59
+ or off-topic responses. Finally,\nthere are technical limitations, such as
60
+ the chatbot''s inability to\nreason or make inferences.\n\nThis article explores
61
+ a potential solution by combining two influential\napproaches in the field
62
+ of Natural Language Processing\u200a---\u200aRetrieval\nAugmented Generation
63
+ (**RAG**) and Knowledge Graphs(**KGs**). We will\ndelve into the partnership
64
+ between these two entities, discuss the\nnotable technologies and software
65
+ used in their processes, and highlight\nvarious options for utilizing their
66
+ combined potential.\n\n### **RAG**\n\nRetrieval-Augmented Generation is the
67
+ process of optimizing the output\nof a large language model using a knowledge
68
+ base outside of its training\ndata sources before generating a response. It
69
+ takes an input and\nretrieves a set of relevant/supporting documents given
70
+ a source (e.g.,\nWikipedia). This can be thought of as a Large Language Model
71
+ (LLM) not\njust putting words together, but carefully selecting relevant\ninformation
72
+ from external sources and Knowledge Graphs to create\nwell-informed and detailed
73
+ responses.\n\n### RAG Retrieval Techniques\n\nThe following are some crucial
74
+ technologies that enable RAG''s impressive\nability to retrieve and incorporate
75
+ relevant information:\n\n**Vector Search**: It transforms text into numerical
76
+ vectors, capturing\ntheir meaning and nuances in a mathematical space, creating
77
+ a map of\nrelationships. Similar texts, like those discussing shared topics
78
+ or\nusing similar language, end up positioned close together in this space,\nallowing
79
+ vector search to quickly identify them as related. This allows\nlightning-fast
80
+ comparisons, finding similar texts based on meaning, not\njust keywords.\n\nAlgorithms
81
+ like [**Faiss**](https://github.com/facebookresearch/faiss)\nand [**Annoy**](https://github.com/spotify/annoy)
82
+ map text into dense\nvectors, enabling fast comparisons and retrieval of relevant
83
+ passages\nbased on semantic similarity.\n\n**Passage Ranking**: It is an internal
84
+ algorithm that scores candidate\ntext passages based on their relevance to
85
+ a query. It considers factors\nlike keyword frequency, keyword overlap, and
86
+ document structure to act\nlike a judge, sifting through information to select
87
+ the most fitting and\ninformative passages.\n\nKeyword overlap measures how
88
+ often the same keywords appear in **both**\nthe query and the candidate passage,
89
+ emphasizing shared vocabulary and\npotential relevance. It differs from keyword
90
+ frequency, which simply\ncounts how often individual keywords appear within
91
+ a passage, regardless\nof their presence in the\u00a0query.\n\nTechniques
92
+ like [**BM25**](https://github.com/getalp/wikIR) and\n[**TF-IDF**](https://github.com/marcocor/wikipedia-idf)
93
+ score candidate\npassages based on keyword overlap and frequency, ensuring
94
+ retrieved\ninformation truly fits the\u00a0context.\n\n**Graph Neural Networks**
95
+ (**GNNs**): They are neural networks designed\nto explore and learn from interconnected
96
+ data like maps, social\nnetworks, and other complex relationships. Unlike
97
+ traditional processing\nmethods that go through data in a linear fashion,
98
+ GNNs are capable of\nrecognizing hidden patterns and understanding relationships
99
+ like \"who\nknows who\" and \"what connects to what\" by \"hopping\" across
100
+ connections\nin\u00a0data.\n\nConsider a graph as a network of dots(nodes)
101
+ connected by lines (edges).\nEach dot represents some information, like a
102
+ person, object, or concept.\nThe lines tell you how these things relate to
103
+ each\u00a0other.\n\nGNNs work in rounds. In each\u00a0round:\n\n1. Message
104
+ Passing: Each node \"talks\" to its neighbors, sending\n messages along
105
+ the edges. These messages contain information about\n the node itself and
106
+ its features.\n2. Node Update: Each node receives messages from all its neighbors
107
+ and\n combines them with its own information. This update can involve\n calculations
108
+ and applying a special function.\n3. Output Calculation: Based on the updated
109
+ information, the network\n calculates an output for each node. This output
110
+ could be a\n prediction about the node''s category, its relationship to
111
+ another\n node, or some other relevant information.\n\nThis process repeats
112
+ for multiple rounds, allowing nodes to incorporate\ninformation from their
113
+ entire neighborhood, not just their direct\nneighbors. As the rounds progress,
114
+ the network learns to understand the\nrelationships between nodes and the
115
+ overall structure of the\u00a0graph.\n\nWhen dealing with Knowledge Graphs,
116
+ frameworks like\n[**PyTorch-Geometric**](https://readthedocs.org/projects/pytorch-geometric/)\nand
117
+ [**DeepMind''s\nGNN**](https://github.com/deepmind/deepmind-research/blob/master/learning_to_simulate/graph_network.py)\nlibrary
118
+ come into play. These frameworks allow GNNs to traverse\ninterconnected entities
119
+ and relationships within the graph, retrieve\nrelevant knowledge fragments,
120
+ and understand complex connections.\n\n### **Knowledge Graphs: The Structured
121
+ Wisdom\u00a0Library**\n\nA knowledge graph, also referred to as a semantic
122
+ network, is a\nstructure that represents a network of real-world entities
123
+ such as\nobjects, events, situations, or concepts. It helps to illustrate
124
+ the\nconstantly changing representations of the world, connecting entities\n(such
125
+ as \"Marie Curie\") and relationships (such as \"won Nobel Prize\") to\nform
126
+ a complex network of information. This information is typically\nstored in
127
+ a graph database and visualized as a graph structure, thus the\nterm knowledge
128
+ \"graph\".\n\nKGs go beyond simply finding relevant facts and delve deeper
129
+ into\nunderstanding the relationships and insights hidden within using these\nprocesses:\n\n**Entity
130
+ Linking**: Imagine a vast network of information, like a big\npuzzle of dots.
131
+ Now imagine trying to connect specific names, places,\nand concepts to their
132
+ corresponding dots in the puzzle. That is what\nentity linking does with text
133
+ and knowledge graphs, connecting the\nspecific components of the text to the
134
+ corresponding nodes in the graph.\nThey help systems understand the exact
135
+ meaning of entities, and find\nrelevant information from the\u00a0graph.\n\nLibraries
136
+ like [**DGL-KeLP**](https://github.com/awslabs/dgl-ke)\nleverage GNNs to identify
137
+ and link named entities (like \"Marie Curie\")\nto their respective nodes
138
+ within the Knowledge Graphs, enabling RAG to\nretrieve information that is
139
+ directly relevant to the core subject of a\nsearch\u00a0query\n\n**Path Mining**:
140
+ Path mining is a process of uncovering hidden\nrelationships and patterns
141
+ that are not easily noticeable. It involves\nexploring complicated networks
142
+ of information and identifying and\ntracing connections between entities that
143
+ may seem unrelated. By doing\nso, path mining reveals surprising insights
144
+ and useful knowledge,\nimproving our understanding of the complex structures
145
+ within knowledge\ngraphs.\n\nTools like [**Neo4j**](https://neo4j.com/) and\n[**Stanza**](https://github.com/stanfordnlp/stanza)
146
+ allow traversing\npaths between entities, uncovering hidden relationships,
147
+ and generating\ninsightful responses based on this deeper understanding.\n\n**Reasoning
148
+ and Inference**: In the context of knowledge graphs,\nreasoning and inference
149
+ are not just limited to discovering facts; they\nare also concerned with utilizing
150
+ them effectively. This involves\nintegrating data, drawing meaningful connections,
151
+ and using logical\nreasoning to resolve issues, foresee future occurrences,
152
+ or even\nconstruct narratives leveraging the insights provided by the knowledge\ngraph.\n\nConsider
153
+ the scenario of trying to find an organization that works in\nspecific sectors
154
+ with the help of a knowledge graph. This analogy\neffectively highlights the
155
+ active role of reasoning and inference in\nknowledge graphs:\n\n1. Gathering
156
+ Facts: Knowledge graphs collect and organize information\n from various
157
+ sources, such as websites, databases, academic papers,\n and social media
158
+ platforms. These facts are represented as\n structured data, with entities
159
+ (e.g., organizations) and their\n attributes (e.g., sectors in which they
160
+ operate) forming nodes and\n edges in the graph. By combining data about
161
+ organizations and\n sectors, knowledge graphs enable the gathering of relevant
162
+ facts for\n analysis.\n2. Integrating information: By connecting an organization''s\n relationships
163
+ with specific sectors, such as partnerships,\n investments, or certifications,
164
+ knowledge graphs reveal the scope\n and relevance of their work within
165
+ those sectors. Links to related\n entities like employees, board members,
166
+ or projects can further\n contribute to understanding an organization''s
167
+ involvement in\n specific\u00a0sectors.\n3. Predicting and Creating: Knowledge
168
+ graphs can leverage machine\n learning and predictive models to infer missing
169
+ or hidden\n information. By analyzing the available facts and connections
170
+ within\n the graph, these models can predict an organization''s potential\n involvement
171
+ in sectors that have common attributes with their known\n areas of operation.
172
+ For example, if an organization has expertise in\n renewable energy, predictive
173
+ models could suggest their likely\n involvement in related sectors like
174
+ clean transportation or\n sustainable infrastructure. Additionally, knowledge
175
+ graphs\n facilitate the creation of new information and insights by combining\n existing
176
+ facts with external data sources. For instance, by\n integrating real-time
177
+ data on industry trends, market analysis, or\n news articles, knowledge
178
+ graphs enable the discovery of emerging\n sectors or upcoming organizations
179
+ that might align with the given\n parameters.\n\nA framework like [**Atomspace**](https://github.com/opencog/atomspace)\nfrom
180
+ [**OpenCog**](https://opencog.org/) empowers RAG to reason and\ninfer new
181
+ knowledge. By traversing paths and combining information from\ninterconnected
182
+ entities, the system can generate informed predictions or\nanswer hypothetical
183
+ questions.\n\n### Purpose\n\nThe combination of Retrieval-Augmented Generation
184
+ (RAG) and Knowledge\nGraphs (KG) is beneficial for several\u00a0reasons:\n\n1. **Enhanced
185
+ information retrieval**: Knowledge graphs provide\n structured and interconnected
186
+ information that can significantly\n improve the effectiveness of information
187
+ retrieval. By using KGs,\n RAG models can retrieve more accurate and relevant
188
+ information,\n leading to better generation and response\u00a0quality.\n2. **Reliable
189
+ and diverse information:** KGs are constructed from\n authoritative sources,
190
+ making them reliable and trustworthy sources\n of information. RAG models
191
+ can leverage this reliable information to\n generate more accurate responses.
192
+ Additionally, KGs help in\n diversifying the generated responses by providing
193
+ a broader pool of\n related facts and entities.\n3. **Context-aware understanding**:
194
+ KGs enable RAG models to understand\n and reason over the contextual information.
195
+ By leveraging the\n relationships and semantic connections encoded in KGs,
196
+ RAG models\n can better grasp the context of user queries or conversations,\n resulting
197
+ in more coherent and appropriate responses.\n4. **Handling complex queries**:
198
+ KGs allow RAG models to tackle complex\n queries by breaking them down
199
+ into smaller sub-queries, retrieving\n relevant pieces of information from
200
+ the KG, and then generating a\n response based on the retrieved knowledge.
201
+ This enables RAG models\n to handle a wide range of user queries effectively.\n5. **Explainability
202
+ and transparency**: KGs provide a transparent and\n interpretable representation
203
+ of knowledge. By integrating KG-based\n retrieval into RAG models, the
204
+ reasoning behind the generated\n responses becomes more explainable. Users
205
+ can have a clear\n understanding of the knowledge sources and connections
206
+ used to\n produce the response.\n6. **Scalability**: Knowledge graphs
207
+ act as large-scale repositories of\n information. RAG models can leverage
208
+ KGs to generate responses to\n various queries or conversations without
209
+ requiring additional\n supervised training data. This makes the RAG+KG
210
+ approach scalable to\n handle an extensive range of knowledge domains and
211
+ user\u00a0queries.\n\n### **Pipeline Possibilities: Orchestrating RAG and\u00a0KGs:**\n\nLet''s
212
+ explore some exciting pipeline options for harnessing the combined\npower
213
+ of RAG and Knowledge Graphs. There are two options in which either\nthe LLM
214
+ is prioritized or the Knowledge Graph is prioritized:\n\n**Option 1: LLM-Centric
215
+ Pipeline:**\n\nThe LLM-Centric pipeline is a RAG and Knowledge Graph combination
216
+ that\nempowers LLMs to craft well-informed responses. Here''s how it\u00a0works:\n\n1. Start
217
+ with the user''s question or statement\n2. The LLM (like GPT-3) generates
218
+ an initial draft response based on\n its internal knowledge. This draft
219
+ may lack specific factual details\n or nuances that a knowledge graph can\u00a0provide.\n3. RAG
220
+ kicks in, searching the text corpus or the Knowledge Graph for\n relevant
221
+ passages that enrich the draft. During the retrieval\n process, RAG retrieval
222
+ techniques are used to search not only text\n corpora but also knowledge
223
+ graphs to find relevant information. This\n means that RAG can directly
224
+ tap into the structured knowledge within\n the graph to retrieve facts,
225
+ relationships, and entities that align\n with the user''s query and the
226
+ LLM''s generated draft.\n4. The retrieved information is carefully fused
227
+ with the LLM''s output,\n creating a more factually accurate and insightful
228
+ response\n5. A final polishing step ensures the response is fluent, grammatically\n correct,
229
+ and ready to\u00a0show.\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*3pd9MOIflkbS07wI\"
230
+ />\n<figcaption>RAG LLM-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
231
+ basic steps to perform this\u00a0are:\n\n1. **Pre-processing**: Clean and
232
+ tokenize user input to prepare for\n processing.\n2. **LLM Generation**:
233
+ Generate an initial draft response using an LLM\n like [**GPT-3**](https://openai.com/product)
234
+ or [**Jurassic-1\n Jumbo**](https://www.livescience.com/google-sentient-ai-lamda-lemoine).\n3. **Retrieval**:
235
+ Employ RAG techniques to retrieve relevant passages\n from a text corpus
236
+ or Knowledge Graphs.\n4. **Fusion**: Integrate retrieved information into
237
+ the LLM-generated\n draft, creating a more informed and factually-grounded
238
+ response.\n5. **Post-processing**: Refine the final response for fluency,\n grammatical
239
+ correctness, and overall coherence.\n\n**Option 2: Knowledge Graphs-Centric
240
+ Pipeline:**\n\nIn this approach, knowledge graphs take center stage. In essence,
241
+ this\npipeline prioritizes the structured knowledge within knowledge graphs,\nusing
242
+ RAG retrieval techniques to translate those insights into\ncompelling and
243
+ informative language. Here''s how it\u00a0unfolds:\n\n1. User input: The
244
+ process begins with the user''s question or statement\n2. Graph exploration:
245
+ The knowledge graph is meticulously explored to\n identify relevant entities,
246
+ relationships, and paths that align with\n the user''s input. This stage
247
+ involves techniques like entity\n linking, path mining, and reasoning to
248
+ uncover valuable information\n within the\u00a0graph\n3. Response planning:
249
+ The insights extracted from the graph are used to\n create a structured
250
+ response plan. This plan outlines the key\n points, facts, and logical
251
+ flow that the final response\n should\u00a0embody\n4. Language generation:
252
+ This is where RAG steps in. Its purpose is to\n create human-like text
253
+ that follows the response plan. It uses LLMs\n to produce well-written
254
+ sentences and paragraphs, combining the\n relevant information from the
255
+ knowledge graph while maintaining\n cohesiveness and readability.\n5. Post-processing:
256
+ The generated response undergoes a final refinement\n process to ensure
257
+ grammatical correctness, clarity, and\n overall\u00a0quality\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*mZ83esKBjbPmCq_C\"
258
+ />\n<figcaption>RAG Knowledge Graph-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
259
+ basic steps\u00a0are:\n\n1. **Query Formulation**: Transform the user input
260
+ into a query\n suitable for Knowledge Graph''s exploration.\n2. **Knowledge
261
+ Graphs:** You can use either Neo4j or\n [NebulaGraph](https://www.nebula-graph.io/)
262
+ to implement a retrieval\n enhancement technique. This technique involves
263
+ utilizing a knowledge\n graph to illustrate the connections between entities
264
+ and\n relationships. Additionally, it incorporates a powerful language\n model
265
+ to improve the retrieval process.\n3. **Fact Selection**: Employ entity linking
266
+ and reasoning algorithms\n to select and prioritize the most relevant facts
267
+ based on the query\n and\u00a0context.\n4. **Natural Language Generation**
268
+ (**NLG**): Utilise specialized NLG\n models like\n [BART](https://research.facebook.com/publications/controllable-abstractive-summarization/)\n to
269
+ translate the extracted facts into a natural language response.\n5. **Refinement**:
270
+ Enhance the generated response for clarity and\n coherence.\n\n### **Unveiling
271
+ a Future of Intelligent Interaction**\n\nThe combination of RAG and Knowledge
272
+ Graphs goes beyond just being a\ntechnological fusion. It paves the way for
273
+ a future where the\ninteraction between humans and computers goes beyond simple
274
+ words and\nbecomes a more informed and refined form of communication. As these\ntechnologies
275
+ continue to develop, we can expect to witness a significant\ntransformation
276
+ in:\n\n- AI-powered assistants that answer your questions with the confidence\n of
277
+ a well-read friend, seamlessly combining relevant facts and\n insights gleaned
278
+ from Knowledge Graphs.\n- Next-generation search engines that go beyond keyword
279
+ matching,\n understanding the deeper meaning behind your queries and delivering\n comprehensive,
280
+ contextual results enriched with information from\n Knowledge Graphs.\n-
281
+ Creative writing tools that utilize RAG and Knowledge Graphs to\n generate
282
+ stories that are both factually accurate and full of\n unexpected plot twists
283
+ and character development, moving beyond\n clich\u00e9d patterns.\n\n###
284
+ **Conclusion**\n\nThe convergence of Retrieval Augmented Generation (RAG)
285
+ and Knowledge\nGraphs (KGs) brings about an exciting synergy in the world
286
+ of Natural\nLanguage Processing (NLP). RAG enhances the output of large language\nmodels
287
+ by carefully selecting relevant information from external sources\nand KGs,
288
+ allowing for well-informed and detailed responses. KGs, on the\nother hand,
289
+ provide a structured representation of real-world entities\nand their relationships,
290
+ enabling the exploration of hidden insights and\nthe discovery of complex
291
+ connections.\n\nThe integration of RAG and KGs opens up two pipeline possibilities.
292
+ The\nLLM-centric pipeline prioritizes the language model''s output, which
293
+ is\nthen enriched with information retrieved from KGs. The Knowledge\nGraphs-centric
294
+ pipeline, on the other hand, places KGs at the center,\nutilizing RAG techniques
295
+ to translate the structured insights into\ncompelling and informative language.\n\nWhile
296
+ integrating LLMs and a knowledge graph for content retrieval\nrequires careful
297
+ planning, the reward is significant. You can gain\naccess to hidden relationships
298
+ within information, ultimately leading to\nhigher-quality output information.\n\nTools
299
+ like **OpenAI**, **Langchain**, and **LlamaIndex** provide\nready-made pipelines
300
+ to integrate knowledge graphs (like **Neo4j**)\neasily. Meanwhile, open-source
301
+ LLMs like **Mistral**, **Llama**, and\n**Dolphin** are catching up to proprietary
302
+ models in performance, making\nthem attractive choices for building custom
303
+ architectures. This\nopen-source scenario allows for the exploration and examination
304
+ of\nvarious methods before fully committing to a particular technological\nframework.
305
+ So, it is crucial to evaluate your needs and choose the\napproach that best
306
+ fits your use\u00a0case.\n\n![](https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=fc0a6900f7eb){width=\"1\"\nheight=\"1\"}\n","doi":"https://doi.org/10.59350/jhrs4-22440","guid":"https://medium.com/p/fc0a6900f7eb","id":"05f01f68-ef81-47d7-a3c1-40aba91d358f","image":"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png","indexed_at":1706690571,"language":"en","published_at":1705557796,"reference":[],"relationships":[],"summary":"<strong>\n
307
+ Tools and Platform for Integration of Knowledge Graph with RAG pipelines.\n</strong>\nAuthors:
308
+ Aland Astudillo, Aishwarya Nambissan Many users of chatbots such as ChatGPT,
309
+ have encountered the problem of receiving inappropriate or incompatible responses.
310
+ There are several reasons why this might\u00a0happen. One reason is the lack
311
+ of appropriate training data, as chatbots are usually trained on large amounts
312
+ of text and code.","tags":["Artificial-intelligence","Machine-learning","Retrieval-augmented","Knowledge-graph"],"title":"Unveiling
313
+ the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs","updated_at":1705557796,"url":"https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb"}
314
+
315
+ '
316
+ recorded_at: Wed, 31 Jan 2024 19:50:01 GMT
317
+ recorded_with: VCR 6.2.0
@@ -10,7 +10,7 @@ describe Commonmeta::Metadata, vcr: true do
10
10
  context 'read commonmeta metadata' do
11
11
  it "default" do
12
12
  expect(subject.valid?).to be true
13
- expect(subject.schema_version).to eq("https://commonmeta.org/commonmeta_v0.10.5.json")
13
+ expect(subject.schema_version).to eq("https://commonmeta.org/commonmeta_v0.10")
14
14
  expect(subject.id).to eq("https://doi.org/10.7554/elife.01567")
15
15
  expect(subject.type).to eq("JournalArticle")
16
16
  expect(subject.url).to eq("https://elifesciences.org/articles/01567")
@@ -36,13 +36,12 @@ describe Commonmeta::Metadata, vcr: true do
36
36
  # expect(subject.valid?).to be true
37
37
  expect(subject.id).to eq("https://doi.org/10.5063/f1m61h5x")
38
38
  expect(subject.type).to eq("Software")
39
- expect(subject.contributors).to eq([{"contributorRoles"=>["Author"],
40
- "name"=>
41
- "Jones, Matthew B.; Slaughter, Peter; Nahf, Rob; Boettiger, Carl ; Jones, Chris; Read, Jordan; Walker, Lauren; Hart, Edmund; Chamberlain, Scott",
42
- "type"=>"Organization"}])
43
- expect(subject.titles).to eq([{"title"=>"dataone: R interface to the DataONE network of data repositories"}])
44
- expect(subject.date).to eq("created"=>"2016-03-12", "published"=>"2016", "registered"=>"2016-03-12", "updated"=>"2020-09-18")
45
- expect(subject.publisher).to eq("name"=>"KNB Data Repository")
39
+ expect(subject.contributors).to eq([{ "contributorRoles" => ["Author"],
40
+ "name" => "Jones, Matthew B.; Slaughter, Peter; Nahf, Rob; Boettiger, Carl ; Jones, Chris; Read, Jordan; Walker, Lauren; Hart, Edmund; Chamberlain, Scott",
41
+ "type" => "Organization" }])
42
+ expect(subject.titles).to eq([{ "title" => "dataone: R interface to the DataONE network of data repositories" }])
43
+ expect(subject.date).to eq("created" => "2016-03-12", "published" => "2016", "registered" => "2016-03-12", "updated" => "2020-09-18")
44
+ expect(subject.publisher).to eq("name" => "KNB Data Repository")
46
45
  expect(subject.provider).to eq("DataCite")
47
46
  end
48
47
 
@@ -70,8 +69,8 @@ describe Commonmeta::Metadata, vcr: true do
70
69
  expect(subject.type).to eq("Dissertation")
71
70
  expect(subject.contributors.length).to eq(3)
72
71
  expect(subject.contributors.first).to eq("type" => "Person", "contributorRoles" => ["Author"],
73
- "givenName" => "Heiko", "familyName" => "Conrad")
74
- expect(subject.contributors.last).to eq("id"=>"https://orcid.org/0000-0002-8633-8234", "type"=>"Person", "contributorRoles"=>["Supervision"], "givenName"=>"Gerhard", "familyName"=>"Gruebel", "affiliation"=>[{"name"=>"Deutsches Elektronen-Synchrotron"}])
72
+ "givenName" => "Heiko", "familyName" => "Conrad")
73
+ expect(subject.contributors.last).to eq("id" => "https://orcid.org/0000-0002-8633-8234", "type" => "Person", "contributorRoles" => ["Supervision"], "givenName" => "Gerhard", "familyName" => "Gruebel", "affiliation" => [{ "name" => "Deutsches Elektronen-Synchrotron" }])
75
74
  expect(subject.titles).to eq([{ "title" => "Dynamics of colloids in molecular glass forming liquids studied via X-ray photon correlation spectroscopy" }])
76
75
  expect(subject.date).to eq("created" => "2018-01-25", "published" => "2014",
77
76
  "registered" => "2018-01-25", "updated" => "2020-09-19")
@@ -91,7 +90,7 @@ describe Commonmeta::Metadata, vcr: true do
91
90
  "affiliation" => [{ "name" => "Тверская государственная сельскохозяйственная академия" }], "familyName" => "Ганичева", "givenName" => "А.В.", "type" => "Person", "contributorRoles" => ["Author"],
92
91
  )
93
92
  expect(subject.titles.last).to eq("title" => "MODEL OF SYSTEM DYNAMICS OF PROCESS OF TRAINING",
94
- "titleType" => "TranslatedTitle")
93
+ "type" => "TranslatedTitle")
95
94
  expect(subject.date).to eq("created" => "2019-02-12", "published" => "2019",
96
95
  "registered" => "2019-02-12", "updated" => "2022-08-23")
97
96
  expect(subject.publisher).to eq("name" => "МОДЕЛИРОВАНИЕ, ОПТИМИЗАЦИЯ И ИНФОРМАЦИОННЫЕ ТЕХНОЛОГИИ")
@@ -115,10 +114,14 @@ describe Commonmeta::Metadata, vcr: true do
115
114
  expect(subject.contributors.first).to eq(
116
115
  "name" => "Europäische Kommission", "contributorRoles" => ["Author"], "type" => "Organization",
117
116
  )
118
- expect(subject.titles).to eq([
119
- { "lang" => "de",
120
- "title" => "Flash Eurobarometer 54 (Madrid Summit)" }, { "lang" => "en", "title" => "Flash Eurobarometer 54 (Madrid Summit)" }, { "titleType" => "Subtitle", "lang" => "de", "title" => "The Common European Currency" }, { "titleType" => "Subtitle", "lang" => "en", "title" => "The Common European Currency" },
121
- ])
117
+ expect(subject.titles).to eq([{ "language" => "de", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
118
+ { "language" => "en", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
119
+ { "language" => "de",
120
+ "title" => "The Common European Currency",
121
+ "type" => "Subtitle" },
122
+ { "language" => "en",
123
+ "title" => "The Common European Currency",
124
+ "type" => "Subtitle" }])
122
125
  expect(subject.subjects).to eq([{ "lang" => "en",
123
126
  "subject" => "KAT12 International Institutions, Relations, Conditions",
124
127
  "subjectScheme" => "ZA" },
@@ -155,5 +158,56 @@ describe Commonmeta::Metadata, vcr: true do
155
158
  expect(subject.license).to eq("id" => "CC-BY-4.0",
156
159
  "url" => "https://creativecommons.org/licenses/by/4.0/legalcode")
157
160
  end
161
+
162
+ it "dataset schema v4.5" do
163
+ input = "#{fixture_path}datacite-dataset_v4.5.json"
164
+ subject = described_class.new(input: input)
165
+ expect(subject.id).to eq("https://doi.org/10.82433/b09z-4k37")
166
+ expect(subject.type).to eq("Dataset")
167
+ expect(subject.contributors.length).to eq(23)
168
+ expect(subject.contributors[0]).to eq("contributorRoles" => ["Author"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
169
+ expect(subject.contributors[2]).to eq("contributorRoles" => ["ContactPerson"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
170
+ expect(subject.date).to eq("created" => "2022-10-27", "published" => "2022", "registered" => "2022-10-27", "updated" => "2024-01-02")
171
+ expect(subject.publisher).to eq("name" => "Example Publisher")
172
+ expect(subject.titles).to eq([{ "language" => "en", "title" => "Example Title" },
173
+ { "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
174
+ { "language" => "fr",
175
+ "title" => "Example TranslatedTitle",
176
+ "type" => "TranslatedTitle" },
177
+ { "language" => "en",
178
+ "title" => "Example AlternativeTitle",
179
+ "type" => "AlternativeTitle" }])
180
+ expect(subject.descriptions).to eq([{ "description" => "Example Abstract",
181
+ "type" => "Abstract",
182
+ "language" => "en" },
183
+ { "description" => "Example Methods",
184
+ "type" => "Methods",
185
+ "language" => "en" },
186
+ { "description" => "Example SeriesInformation",
187
+ "type" => "Other",
188
+ "language" => "en" },
189
+ { "description" => "Example TableOfContents",
190
+ "type" => "Other",
191
+ "language" => "en" },
192
+ { "description" => "Example TechnicalInfo",
193
+ "type" => "TechnicalInfo",
194
+ "language" => "en" },
195
+ { "description" => "Example Other", "type" => "Other", "language" => "en" }])
196
+ expect(subject.license).to eq("id" => "CC-PDDC", "url" => "https://creativecommons.org/licenses/publicdomain/")
197
+ end
198
+
199
+ it "instrument" do
200
+ input = "#{fixture_path}datacite-instrument.json"
201
+ subject = described_class.new(input: input)
202
+ puts subject.errors unless subject.valid?
203
+ expect(subject.valid?).to be true
204
+ expect(subject.id).to eq("https://doi.org/10.82433/08qf-ee96")
205
+ expect(subject.type).to eq("Instrument")
206
+ expect(subject.contributors.length).to eq(2)
207
+ expect(subject.contributors.first).to eq("contributorRoles" => ["Author"], "name" => "DECTRIS", "type" => "Organization", "id" => "https://www.wikidata.org/wiki/Q107529885")
208
+ expect(subject.date).to eq("created" => "2022-10-20", "published" => "2022", "registered" => "2022-10-20", "updated" => "2024-01-02")
209
+ expect(subject.publisher).to eq("name" => "Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences")
210
+ expect(subject.license).to be_nil
211
+ end
158
212
  end
159
213
  end
@@ -189,6 +189,31 @@ describe Commonmeta::Metadata, vcr: true do
189
189
  expect(subject.references).to be_nil
190
190
  end
191
191
 
192
+ it "medium post with institutional author" do
193
+ input = "https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f"
194
+ subject = described_class.new(input: input)
195
+ # expect(subject.valid?).to be true
196
+ expect(subject.id).to eq("https://doi.org/10.59350/jhrs4-22440")
197
+ expect(subject.url).to eq("https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb")
198
+ expect(subject.alternate_identifiers).to eq([{ "alternateIdentifier" => "05f01f68-ef81-47d7-a3c1-40aba91d358f", "alternateIdentifierType" => "UUID" }])
199
+ expect(subject.type).to eq("Article")
200
+ expect(subject.contributors.length).to eq(1)
201
+ expect(subject.contributors.first).to eq("contributorRoles"=>["Author"], "name"=>"Research Graph", "type"=>"Organization")
202
+ expect(subject.titles).to eq([{ "title" => "Unveiling the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs" }])
203
+ expect(subject.license).to eq("id" => "CC-BY-4.0",
204
+ "url" => "https://creativecommons.org/licenses/by/4.0/legalcode")
205
+ expect(subject.date).to eq("published"=>"2024-01-18", "updated"=>"2024-01-18")
206
+ expect(subject.descriptions.first["description"]).to start_with("<strong> Tools and Platform for Integration of Knowledge Graph with RAG pipelines.")
207
+ expect(subject.publisher).to eq("name" => "Research Graph")
208
+ expect(subject.subjects).to eq([{ "subject" => "Computer and information sciences" },
209
+ { "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf",
210
+ "subject" => "FOS: Computer and information sciences",
211
+ "subjectScheme" => "Fields of Science and Technology (FOS)" }])
212
+ expect(subject.language).to eq("en")
213
+ expect(subject.container).to eq("identifier" => "https://medium.com/@researchgraph", "identifierType" => "URL", "title" => "Research Graph", "type" => "Periodical")
214
+ expect(subject.references).to be_nil
215
+ end
216
+
192
217
  it "syldavia gazette post with references" do
193
218
  input = "https://api.rogue-scholar.org/posts/0022b9ef-525a-4a79-81ad-13411697f58a"
194
219
  subject = described_class.new(input: input)
data/spec/utils_spec.rb CHANGED
@@ -502,7 +502,7 @@ describe Commonmeta::Metadata do
502
502
  links = [{ "rel" => "self", "type" => "application/atom+xml", "href" => "https://syldavia-gazette.org/atom/" },
503
503
  { "rel" => "alternate", "type" => "text/html", "href" => "https://syldavia-gazette.org" },
504
504
  { "rel" => "license", "type" => "text/html", "href" => "https://creativecommons.org/licenses/by/4.0/legalcode" }]
505
-
505
+
506
506
  it "url" do
507
507
  response = subject.get_link(links, "self")
508
508
  expect(response).to eq("https://syldavia-gazette.org/atom/")
@@ -721,7 +721,7 @@ describe Commonmeta::Metadata do
721
721
  it "decode doi to uuid" do
722
722
  doi = "https://doi.org/10.53731/6315bn4-aqg82ja-4a9wxdt-29f7279"
723
723
  response = subject.decode_doi(doi, uuid: true)
724
- expect(response).to eq('255d48ab-c102-9288-a4f3-add092f388e9')
724
+ expect(response).to eq("255d48ab-c102-9288-a4f3-add092f388e9")
725
725
  end
726
726
  end
727
727
 
@@ -745,10 +745,36 @@ describe Commonmeta::Metadata do
745
745
  end
746
746
  end
747
747
 
748
- context 'json_feed_unregistered_url' do
749
- it 'all posts' do
748
+ context "json_feed_unregistered_url" do
749
+ it "all posts" do
750
750
  response = subject.json_feed_unregistered_url
751
751
  expect(response).to eq("https://api.rogue-scholar.org/posts/unregistered")
752
752
  end
753
753
  end
754
+
755
+ context "normalize_name_identifier" do
756
+ it "ORCID" do
757
+ hsh = {"schemeUri"=>"https://orcid.org", "nameIdentifier"=>"https://orcid.org/0000-0003-1419-2405", "nameIdentifierScheme"=>"ORCID"}
758
+ response = subject.normalize_name_identifier(hsh)
759
+ expect(response).to eq("https://orcid.org/0000-0003-1419-2405")
760
+ end
761
+
762
+ it "ROR" do
763
+ hsh = { "schemeUri" => "https://ror.org", "nameIdentifier" => "https://ror.org/02aj13c28", "nameIdentifierScheme" => "ROR" }
764
+ response = subject.normalize_name_identifier(hsh)
765
+ expect(response).to eq("https://ror.org/02aj13c28")
766
+ end
767
+
768
+ it "ISNI" do
769
+ hsh = { "schemeUri" => "http://isni.org/isni/", "nameIdentifier" => "0000000134596520", "nameIdentifierScheme" => "ISNI" }
770
+ response = subject.normalize_name_identifier(hsh)
771
+ expect(response).to eq("https://isni.org/isni/0000000134596520")
772
+ end
773
+
774
+ it "Wikidata" do
775
+ hsh = {"schemeUri"=>"https://www.wikidata.org/wiki/", "nameIdentifier"=>"Q107529885", "nameIdentifierScheme"=>"Wikidata"}
776
+ response = subject.normalize_name_identifier(hsh)
777
+ expect(response).to eq("https://www.wikidata.org/wiki/Q107529885")
778
+ end
779
+ end
754
780
  end
@@ -33,11 +33,38 @@ describe Commonmeta::Metadata, vcr: true do
33
33
  "volume" => "426",
34
34
  "firstPage" => "181",
35
35
  "containerTitle" => "Nature")
36
- expect(json["date"]).to eq("published"=>"2014-02-11", "updated"=>"2022-03-26")
36
+ expect(json["date"]).to eq("published" => "2014-02-11", "updated" => "2022-03-26")
37
37
  expect(json["descriptions"].first["description"]).to start_with("Among various advantages,")
38
- expect(json["license"]).to eq("id"=>"CC-BY-3.0", "url"=>"https://creativecommons.org/licenses/by/3.0/legalcode")
38
+ expect(json["license"]).to eq("id" => "CC-BY-3.0", "url" => "https://creativecommons.org/licenses/by/3.0/legalcode")
39
39
  expect(json["provider"]).to eq("Crossref")
40
- expect(json["files"].first).to eq("mimeType"=>"application/pdf", "url"=>"https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
40
+ expect(json["files"].first).to eq("mimeType" => "application/pdf", "url" => "https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
41
+ end
42
+
43
+ it "dataset schema v4.5" do
44
+ input = "#{fixture_path}datacite-dataset_v4.5.json"
45
+ subject = described_class.new(input: input)
46
+ expect(subject.id).to eq("https://doi.org/10.82433/b09z-4k37")
47
+ json = JSON.parse(subject.commonmeta)
48
+ expect(json["id"]).to eq("https://doi.org/10.82433/b09z-4k37")
49
+ expect(json["type"]).to eq("Dataset")
50
+ expect(json["titles"]).to eq([{ "language" => "en", "title" => "Example Title" },
51
+ { "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
52
+ { "language" => "fr",
53
+ "title" => "Example TranslatedTitle",
54
+ "type" => "TranslatedTitle" },
55
+ { "language" => "en",
56
+ "title" => "Example AlternativeTitle",
57
+ "type" => "AlternativeTitle" }])
58
+ expect(json["descriptions"]).to eq([{ "description" => "Example Abstract", "language" => "en", "type" => "Abstract" },
59
+ { "description" => "Example Methods", "language" => "en", "type" => "Methods" },
60
+ { "description" => "Example SeriesInformation",
61
+ "language" => "en",
62
+ "type" => "Other" },
63
+ { "description" => "Example TableOfContents", "language" => "en", "type" => "Other" },
64
+ { "description" => "Example TechnicalInfo",
65
+ "language" => "en",
66
+ "type" => "TechnicalInfo" },
67
+ { "description" => "Example Other", "language" => "en", "type" => "Other" }])
41
68
  end
42
69
  end
43
70
  end
@@ -7,6 +7,7 @@ describe Commonmeta::Metadata, vcr: true do
7
7
  it 'Dataset' do
8
8
  input = 'https://doi.org/10.5061/DRYAD.8515'
9
9
  subject = described_class.new(input: input, from: 'datacite')
10
+ puts subject.errors unless subject.valid?
10
11
  expect(subject.valid?).to be true
11
12
  json = JSON.parse(subject.csl)
12
13
  expect(json['type']).to eq('dataset')
@@ -37,6 +37,7 @@ describe Commonmeta::Metadata, vcr: true do
37
37
  it 'text' do
38
38
  input = 'https://doi.org/10.3204/desy-2014-01645'
39
39
  subject = described_class.new(input: input, from: 'datacite')
40
+ puts subject.errors unless subject.valid?
40
41
  expect(subject.valid?).to be true
41
42
  csv = subject.csv.parse_csv
42
43
 
@@ -136,7 +136,6 @@ describe Commonmeta::Metadata, vcr: true do
136
136
  it 'from schema.org' do
137
137
  input = 'https://blog.front-matter.io/posts/eating-your-own-dog-food/'
138
138
  subject = described_class.new(input: input, from: 'schema_org')
139
- puts subject.errors
140
139
  expect(subject.valid?).to be true
141
140
  datacite = JSON.parse(subject.datacite)
142
141
  expect(datacite.fetch('titles')).to eq([{ 'title' => 'Eating your own Dog Food' }])