commonmeta-ruby 3.9.0 → 3.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (32) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +12 -11
  3. data/lib/commonmeta/author_utils.rb +12 -5
  4. data/lib/commonmeta/readers/commonmeta_reader.rb +1 -1
  5. data/lib/commonmeta/readers/datacite_reader.rb +120 -108
  6. data/lib/commonmeta/schema_utils.rb +1 -1
  7. data/lib/commonmeta/utils.rb +47 -2
  8. data/lib/commonmeta/version.rb +1 -1
  9. data/lib/commonmeta/writers/commonmeta_writer.rb +1 -1
  10. data/resources/{commonmeta_v0.10.5.json → commonmeta_v0.10.7.json} +21 -5
  11. data/resources/{datacite-v4.json → datacite-v45.json} +26 -5
  12. data/resources/kernel-4/include/datacite-relationType-v4.xsd +2 -0
  13. data/resources/kernel-4/include/datacite-resourceType-v4.xsd +2 -0
  14. data/resources/kernel-4/metadata.xsd +11 -7
  15. data/spec/author_utils_spec.rb +10 -0
  16. data/spec/fixtures/commonmeta.json +1 -1
  17. data/spec/fixtures/datacite-dataset_v4.5.json +736 -0
  18. data/spec/fixtures/datacite-instrument.json +135 -0
  19. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/SoftwareSourceCode.yml +8 -8
  20. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/dissertation.yml +12 -12
  21. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/funding_references.yml +12 -12
  22. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_datacite_metadata/subject_scheme.yml +22 -22
  23. data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/medium_post_with_institutional_author.yml +317 -0
  24. data/spec/readers/commonmeta_reader_spec.rb +1 -1
  25. data/spec/readers/datacite_reader_spec.rb +68 -14
  26. data/spec/readers/json_feed_reader_spec.rb +25 -0
  27. data/spec/utils_spec.rb +30 -4
  28. data/spec/writers/commonmeta_writer_spec.rb +30 -3
  29. data/spec/writers/csl_writer_spec.rb +1 -0
  30. data/spec/writers/csv_writer_spec.rb +1 -0
  31. data/spec/writers/datacite_writer_spec.rb +0 -1
  32. metadata +7 -4
@@ -0,0 +1,317 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f
6
+ body:
7
+ encoding: ASCII-8BIT
8
+ string: ''
9
+ headers:
10
+ Connection:
11
+ - close
12
+ Host:
13
+ - api.rogue-scholar.org
14
+ User-Agent:
15
+ - http.rb/5.1.1
16
+ response:
17
+ status:
18
+ code: 200
19
+ message: OK
20
+ headers:
21
+ Content-Type:
22
+ - application/json
23
+ Content-Length:
24
+ - '23886'
25
+ Ratelimit-Limit:
26
+ - '15'
27
+ Ratelimit-Remaining:
28
+ - '14'
29
+ Ratelimit-Reset:
30
+ - '3'
31
+ Date:
32
+ - Wed, 31 Jan 2024 19:50:01 GMT
33
+ Server:
34
+ - Fly/ba9e227a (2024-01-26)
35
+ Via:
36
+ - 1.1 fly.io
37
+ Fly-Request-Id:
38
+ - 01HNGH4EZV3XQF20H1PZ6X5N07-fra
39
+ body:
40
+ encoding: UTF-8
41
+ string: '{"abstract":null,"archive_url":null,"authors":[{"name":"Research Graph"}],"blog":{"api":false,"archive_prefix":null,"authors":null,"backlog":0,"canonical_url":null,"category":"computerAndInformationSciences","created_at":1706685423,"current_feed_url":null,"description":"Stories
42
+ by Research Graph on Medium","favicon":"https://cdn-images-1.medium.com/fit/c/150/150/1*laJi0jBkVoGhXid7gD_DmQ.png","feed_format":"application/rss+xml","feed_url":"https://medium.com/@researchgraph/feed","filter":null,"funding":null,"generator":"Medium","generator_raw":"Medium","home_page_url":"https://medium.com/@researchgraph","id":"30da2ca9-8258-4ab5-acca-3919d9a5d98d","indexed":true,"issn":null,"language":"en","license":"https://creativecommons.org/licenses/by/4.0/legalcode","mastodon":"","plan":"Starter","prefix":"10.59350","relative_url":null,"ror":null,"secure":true,"slug":"researchgraph","status":"active","title":"Research
43
+ Graph","updated_at":1706151454,"use_api":null,"use_mastodon":false,"user_id":"a7e16958-1175-437c-b839-d4b8a47ec811","version":"https://jsonfeed.org/version/1.1"},"blog_name":"Research
44
+ Graph","blog_slug":"researchgraph","content_text":"**Tools and Platform for
45
+ Integration of Knowledge Graph with RAG\npipelines.**\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png\"\nalt=\"Complex
46
+ network connected to books and showing information from magespace\" />\n<figcaption>Image
47
+ Created in <a\nhref=\"https://www.mage.space/\">https://www.mage.space/</a></figcaption>\n</figure>\n\nAuthors:
48
+ [Aland\nAstudillo](https://www.linkedin.com/in/aland-astudillo/), [Aishwarya\nNambissan](https://www.linkedin.com/in/aishwarya-nambissan-127229200/)\n\nMany
49
+ users of chatbots such as ChatGPT, have encountered the problem of\nreceiving
50
+ inappropriate or incompatible responses. There are several\nreasons why this
51
+ might\u00a0happen.\n\nOne reason is the lack of appropriate training data,
52
+ as chatbots are\nusually trained on large amounts of text and code. If the
53
+ data is\ninsufficient or of poor quality, the chatbot may misunderstand queries\nand
54
+ provide inaccurate responses. Another reason is that some chatbots\nare designed
55
+ for specific tasks or domains, which limits their ability\nto handle broader
56
+ queries or understand subtle nuances in conversation.\nAdditionally, chatbots
57
+ may struggle with natural language, which is\ncomplex and often ambiguous.
58
+ This can cause them to misunderstand a\nuser''s query and provide irrelevant
59
+ or off-topic responses. Finally,\nthere are technical limitations, such as
60
+ the chatbot''s inability to\nreason or make inferences.\n\nThis article explores
61
+ a potential solution by combining two influential\napproaches in the field
62
+ of Natural Language Processing\u200a---\u200aRetrieval\nAugmented Generation
63
+ (**RAG**) and Knowledge Graphs(**KGs**). We will\ndelve into the partnership
64
+ between these two entities, discuss the\nnotable technologies and software
65
+ used in their processes, and highlight\nvarious options for utilizing their
66
+ combined potential.\n\n### **RAG**\n\nRetrieval-Augmented Generation is the
67
+ process of optimizing the output\nof a large language model using a knowledge
68
+ base outside of its training\ndata sources before generating a response. It
69
+ takes an input and\nretrieves a set of relevant/supporting documents given
70
+ a source (e.g.,\nWikipedia). This can be thought of as a Large Language Model
71
+ (LLM) not\njust putting words together, but carefully selecting relevant\ninformation
72
+ from external sources and Knowledge Graphs to create\nwell-informed and detailed
73
+ responses.\n\n### RAG Retrieval Techniques\n\nThe following are some crucial
74
+ technologies that enable RAG''s impressive\nability to retrieve and incorporate
75
+ relevant information:\n\n**Vector Search**: It transforms text into numerical
76
+ vectors, capturing\ntheir meaning and nuances in a mathematical space, creating
77
+ a map of\nrelationships. Similar texts, like those discussing shared topics
78
+ or\nusing similar language, end up positioned close together in this space,\nallowing
79
+ vector search to quickly identify them as related. This allows\nlightning-fast
80
+ comparisons, finding similar texts based on meaning, not\njust keywords.\n\nAlgorithms
81
+ like [**Faiss**](https://github.com/facebookresearch/faiss)\nand [**Annoy**](https://github.com/spotify/annoy)
82
+ map text into dense\nvectors, enabling fast comparisons and retrieval of relevant
83
+ passages\nbased on semantic similarity.\n\n**Passage Ranking**: It is an internal
84
+ algorithm that scores candidate\ntext passages based on their relevance to
85
+ a query. It considers factors\nlike keyword frequency, keyword overlap, and
86
+ document structure to act\nlike a judge, sifting through information to select
87
+ the most fitting and\ninformative passages.\n\nKeyword overlap measures how
88
+ often the same keywords appear in **both**\nthe query and the candidate passage,
89
+ emphasizing shared vocabulary and\npotential relevance. It differs from keyword
90
+ frequency, which simply\ncounts how often individual keywords appear within
91
+ a passage, regardless\nof their presence in the\u00a0query.\n\nTechniques
92
+ like [**BM25**](https://github.com/getalp/wikIR) and\n[**TF-IDF**](https://github.com/marcocor/wikipedia-idf)
93
+ score candidate\npassages based on keyword overlap and frequency, ensuring
94
+ retrieved\ninformation truly fits the\u00a0context.\n\n**Graph Neural Networks**
95
+ (**GNNs**): They are neural networks designed\nto explore and learn from interconnected
96
+ data like maps, social\nnetworks, and other complex relationships. Unlike
97
+ traditional processing\nmethods that go through data in a linear fashion,
98
+ GNNs are capable of\nrecognizing hidden patterns and understanding relationships
99
+ like \"who\nknows who\" and \"what connects to what\" by \"hopping\" across
100
+ connections\nin\u00a0data.\n\nConsider a graph as a network of dots(nodes)
101
+ connected by lines (edges).\nEach dot represents some information, like a
102
+ person, object, or concept.\nThe lines tell you how these things relate to
103
+ each\u00a0other.\n\nGNNs work in rounds. In each\u00a0round:\n\n1. Message
104
+ Passing: Each node \"talks\" to its neighbors, sending\n messages along
105
+ the edges. These messages contain information about\n the node itself and
106
+ its features.\n2. Node Update: Each node receives messages from all its neighbors
107
+ and\n combines them with its own information. This update can involve\n calculations
108
+ and applying a special function.\n3. Output Calculation: Based on the updated
109
+ information, the network\n calculates an output for each node. This output
110
+ could be a\n prediction about the node''s category, its relationship to
111
+ another\n node, or some other relevant information.\n\nThis process repeats
112
+ for multiple rounds, allowing nodes to incorporate\ninformation from their
113
+ entire neighborhood, not just their direct\nneighbors. As the rounds progress,
114
+ the network learns to understand the\nrelationships between nodes and the
115
+ overall structure of the\u00a0graph.\n\nWhen dealing with Knowledge Graphs,
116
+ frameworks like\n[**PyTorch-Geometric**](https://readthedocs.org/projects/pytorch-geometric/)\nand
117
+ [**DeepMind''s\nGNN**](https://github.com/deepmind/deepmind-research/blob/master/learning_to_simulate/graph_network.py)\nlibrary
118
+ come into play. These frameworks allow GNNs to traverse\ninterconnected entities
119
+ and relationships within the graph, retrieve\nrelevant knowledge fragments,
120
+ and understand complex connections.\n\n### **Knowledge Graphs: The Structured
121
+ Wisdom\u00a0Library**\n\nA knowledge graph, also referred to as a semantic
122
+ network, is a\nstructure that represents a network of real-world entities
123
+ such as\nobjects, events, situations, or concepts. It helps to illustrate
124
+ the\nconstantly changing representations of the world, connecting entities\n(such
125
+ as \"Marie Curie\") and relationships (such as \"won Nobel Prize\") to\nform
126
+ a complex network of information. This information is typically\nstored in
127
+ a graph database and visualized as a graph structure, thus the\nterm knowledge
128
+ \"graph\".\n\nKGs go beyond simply finding relevant facts and delve deeper
129
+ into\nunderstanding the relationships and insights hidden within using these\nprocesses:\n\n**Entity
130
+ Linking**: Imagine a vast network of information, like a big\npuzzle of dots.
131
+ Now imagine trying to connect specific names, places,\nand concepts to their
132
+ corresponding dots in the puzzle. That is what\nentity linking does with text
133
+ and knowledge graphs, connecting the\nspecific components of the text to the
134
+ corresponding nodes in the graph.\nThey help systems understand the exact
135
+ meaning of entities, and find\nrelevant information from the\u00a0graph.\n\nLibraries
136
+ like [**DGL-KeLP**](https://github.com/awslabs/dgl-ke)\nleverage GNNs to identify
137
+ and link named entities (like \"Marie Curie\")\nto their respective nodes
138
+ within the Knowledge Graphs, enabling RAG to\nretrieve information that is
139
+ directly relevant to the core subject of a\nsearch\u00a0query\n\n**Path Mining**:
140
+ Path mining is a process of uncovering hidden\nrelationships and patterns
141
+ that are not easily noticeable. It involves\nexploring complicated networks
142
+ of information and identifying and\ntracing connections between entities that
143
+ may seem unrelated. By doing\nso, path mining reveals surprising insights
144
+ and useful knowledge,\nimproving our understanding of the complex structures
145
+ within knowledge\ngraphs.\n\nTools like [**Neo4j**](https://neo4j.com/) and\n[**Stanza**](https://github.com/stanfordnlp/stanza)
146
+ allow traversing\npaths between entities, uncovering hidden relationships,
147
+ and generating\ninsightful responses based on this deeper understanding.\n\n**Reasoning
148
+ and Inference**: In the context of knowledge graphs,\nreasoning and inference
149
+ are not just limited to discovering facts; they\nare also concerned with utilizing
150
+ them effectively. This involves\nintegrating data, drawing meaningful connections,
151
+ and using logical\nreasoning to resolve issues, foresee future occurrences,
152
+ or even\nconstruct narratives leveraging the insights provided by the knowledge\ngraph.\n\nConsider
153
+ the scenario of trying to find an organization that works in\nspecific sectors
154
+ with the help of a knowledge graph. This analogy\neffectively highlights the
155
+ active role of reasoning and inference in\nknowledge graphs:\n\n1. Gathering
156
+ Facts: Knowledge graphs collect and organize information\n from various
157
+ sources, such as websites, databases, academic papers,\n and social media
158
+ platforms. These facts are represented as\n structured data, with entities
159
+ (e.g., organizations) and their\n attributes (e.g., sectors in which they
160
+ operate) forming nodes and\n edges in the graph. By combining data about
161
+ organizations and\n sectors, knowledge graphs enable the gathering of relevant
162
+ facts for\n analysis.\n2. Integrating information: By connecting an organization''s\n relationships
163
+ with specific sectors, such as partnerships,\n investments, or certifications,
164
+ knowledge graphs reveal the scope\n and relevance of their work within
165
+ those sectors. Links to related\n entities like employees, board members,
166
+ or projects can further\n contribute to understanding an organization''s
167
+ involvement in\n specific\u00a0sectors.\n3. Predicting and Creating: Knowledge
168
+ graphs can leverage machine\n learning and predictive models to infer missing
169
+ or hidden\n information. By analyzing the available facts and connections
170
+ within\n the graph, these models can predict an organization''s potential\n involvement
171
+ in sectors that have common attributes with their known\n areas of operation.
172
+ For example, if an organization has expertise in\n renewable energy, predictive
173
+ models could suggest their likely\n involvement in related sectors like
174
+ clean transportation or\n sustainable infrastructure. Additionally, knowledge
175
+ graphs\n facilitate the creation of new information and insights by combining\n existing
176
+ facts with external data sources. For instance, by\n integrating real-time
177
+ data on industry trends, market analysis, or\n news articles, knowledge
178
+ graphs enable the discovery of emerging\n sectors or upcoming organizations
179
+ that might align with the given\n parameters.\n\nA framework like [**Atomspace**](https://github.com/opencog/atomspace)\nfrom
180
+ [**OpenCog**](https://opencog.org/) empowers RAG to reason and\ninfer new
181
+ knowledge. By traversing paths and combining information from\ninterconnected
182
+ entities, the system can generate informed predictions or\nanswer hypothetical
183
+ questions.\n\n### Purpose\n\nThe combination of Retrieval-Augmented Generation
184
+ (RAG) and Knowledge\nGraphs (KG) is beneficial for several\u00a0reasons:\n\n1. **Enhanced
185
+ information retrieval**: Knowledge graphs provide\n structured and interconnected
186
+ information that can significantly\n improve the effectiveness of information
187
+ retrieval. By using KGs,\n RAG models can retrieve more accurate and relevant
188
+ information,\n leading to better generation and response\u00a0quality.\n2. **Reliable
189
+ and diverse information:** KGs are constructed from\n authoritative sources,
190
+ making them reliable and trustworthy sources\n of information. RAG models
191
+ can leverage this reliable information to\n generate more accurate responses.
192
+ Additionally, KGs help in\n diversifying the generated responses by providing
193
+ a broader pool of\n related facts and entities.\n3. **Context-aware understanding**:
194
+ KGs enable RAG models to understand\n and reason over the contextual information.
195
+ By leveraging the\n relationships and semantic connections encoded in KGs,
196
+ RAG models\n can better grasp the context of user queries or conversations,\n resulting
197
+ in more coherent and appropriate responses.\n4. **Handling complex queries**:
198
+ KGs allow RAG models to tackle complex\n queries by breaking them down
199
+ into smaller sub-queries, retrieving\n relevant pieces of information from
200
+ the KG, and then generating a\n response based on the retrieved knowledge.
201
+ This enables RAG models\n to handle a wide range of user queries effectively.\n5. **Explainability
202
+ and transparency**: KGs provide a transparent and\n interpretable representation
203
+ of knowledge. By integrating KG-based\n retrieval into RAG models, the
204
+ reasoning behind the generated\n responses becomes more explainable. Users
205
+ can have a clear\n understanding of the knowledge sources and connections
206
+ used to\n produce the response.\n6. **Scalability**: Knowledge graphs
207
+ act as large-scale repositories of\n information. RAG models can leverage
208
+ KGs to generate responses to\n various queries or conversations without
209
+ requiring additional\n supervised training data. This makes the RAG+KG
210
+ approach scalable to\n handle an extensive range of knowledge domains and
211
+ user\u00a0queries.\n\n### **Pipeline Possibilities: Orchestrating RAG and\u00a0KGs:**\n\nLet''s
212
+ explore some exciting pipeline options for harnessing the combined\npower
213
+ of RAG and Knowledge Graphs. There are two options in which either\nthe LLM
214
+ is prioritized or the Knowledge Graph is prioritized:\n\n**Option 1: LLM-Centric
215
+ Pipeline:**\n\nThe LLM-Centric pipeline is a RAG and Knowledge Graph combination
216
+ that\nempowers LLMs to craft well-informed responses. Here''s how it\u00a0works:\n\n1. Start
217
+ with the user''s question or statement\n2. The LLM (like GPT-3) generates
218
+ an initial draft response based on\n its internal knowledge. This draft
219
+ may lack specific factual details\n or nuances that a knowledge graph can\u00a0provide.\n3. RAG
220
+ kicks in, searching the text corpus or the Knowledge Graph for\n relevant
221
+ passages that enrich the draft. During the retrieval\n process, RAG retrieval
222
+ techniques are used to search not only text\n corpora but also knowledge
223
+ graphs to find relevant information. This\n means that RAG can directly
224
+ tap into the structured knowledge within\n the graph to retrieve facts,
225
+ relationships, and entities that align\n with the user''s query and the
226
+ LLM''s generated draft.\n4. The retrieved information is carefully fused
227
+ with the LLM''s output,\n creating a more factually accurate and insightful
228
+ response\n5. A final polishing step ensures the response is fluent, grammatically\n correct,
229
+ and ready to\u00a0show.\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*3pd9MOIflkbS07wI\"
230
+ />\n<figcaption>RAG LLM-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
231
+ basic steps to perform this\u00a0are:\n\n1. **Pre-processing**: Clean and
232
+ tokenize user input to prepare for\n processing.\n2. **LLM Generation**:
233
+ Generate an initial draft response using an LLM\n like [**GPT-3**](https://openai.com/product)
234
+ or [**Jurassic-1\n Jumbo**](https://www.livescience.com/google-sentient-ai-lamda-lemoine).\n3. **Retrieval**:
235
+ Employ RAG techniques to retrieve relevant passages\n from a text corpus
236
+ or Knowledge Graphs.\n4. **Fusion**: Integrate retrieved information into
237
+ the LLM-generated\n draft, creating a more informed and factually-grounded
238
+ response.\n5. **Post-processing**: Refine the final response for fluency,\n grammatical
239
+ correctness, and overall coherence.\n\n**Option 2: Knowledge Graphs-Centric
240
+ Pipeline:**\n\nIn this approach, knowledge graphs take center stage. In essence,
241
+ this\npipeline prioritizes the structured knowledge within knowledge graphs,\nusing
242
+ RAG retrieval techniques to translate those insights into\ncompelling and
243
+ informative language. Here''s how it\u00a0unfolds:\n\n1. User input: The
244
+ process begins with the user''s question or statement\n2. Graph exploration:
245
+ The knowledge graph is meticulously explored to\n identify relevant entities,
246
+ relationships, and paths that align with\n the user''s input. This stage
247
+ involves techniques like entity\n linking, path mining, and reasoning to
248
+ uncover valuable information\n within the\u00a0graph\n3. Response planning:
249
+ The insights extracted from the graph are used to\n create a structured
250
+ response plan. This plan outlines the key\n points, facts, and logical
251
+ flow that the final response\n should\u00a0embody\n4. Language generation:
252
+ This is where RAG steps in. Its purpose is to\n create human-like text
253
+ that follows the response plan. It uses LLMs\n to produce well-written
254
+ sentences and paragraphs, combining the\n relevant information from the
255
+ knowledge graph while maintaining\n cohesiveness and readability.\n5. Post-processing:
256
+ The generated response undergoes a final refinement\n process to ensure
257
+ grammatical correctness, clarity, and\n overall\u00a0quality\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*mZ83esKBjbPmCq_C\"
258
+ />\n<figcaption>RAG Knowledge Graph-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
259
+ basic steps\u00a0are:\n\n1. **Query Formulation**: Transform the user input
260
+ into a query\n suitable for Knowledge Graph''s exploration.\n2. **Knowledge
261
+ Graphs:** You can use either Neo4j or\n [NebulaGraph](https://www.nebula-graph.io/)
262
+ to implement a retrieval\n enhancement technique. This technique involves
263
+ utilizing a knowledge\n graph to illustrate the connections between entities
264
+ and\n relationships. Additionally, it incorporates a powerful language\n model
265
+ to improve the retrieval process.\n3. **Fact Selection**: Employ entity linking
266
+ and reasoning algorithms\n to select and prioritize the most relevant facts
267
+ based on the query\n and\u00a0context.\n4. **Natural Language Generation**
268
+ (**NLG**): Utilise specialized NLG\n models like\n [BART](https://research.facebook.com/publications/controllable-abstractive-summarization/)\n to
269
+ translate the extracted facts into a natural language response.\n5. **Refinement**:
270
+ Enhance the generated response for clarity and\n coherence.\n\n### **Unveiling
271
+ a Future of Intelligent Interaction**\n\nThe combination of RAG and Knowledge
272
+ Graphs goes beyond just being a\ntechnological fusion. It paves the way for
273
+ a future where the\ninteraction between humans and computers goes beyond simple
274
+ words and\nbecomes a more informed and refined form of communication. As these\ntechnologies
275
+ continue to develop, we can expect to witness a significant\ntransformation
276
+ in:\n\n- AI-powered assistants that answer your questions with the confidence\n of
277
+ a well-read friend, seamlessly combining relevant facts and\n insights gleaned
278
+ from Knowledge Graphs.\n- Next-generation search engines that go beyond keyword
279
+ matching,\n understanding the deeper meaning behind your queries and delivering\n comprehensive,
280
+ contextual results enriched with information from\n Knowledge Graphs.\n-
281
+ Creative writing tools that utilize RAG and Knowledge Graphs to\n generate
282
+ stories that are both factually accurate and full of\n unexpected plot twists
283
+ and character development, moving beyond\n clich\u00e9d patterns.\n\n###
284
+ **Conclusion**\n\nThe convergence of Retrieval Augmented Generation (RAG)
285
+ and Knowledge\nGraphs (KGs) brings about an exciting synergy in the world
286
+ of Natural\nLanguage Processing (NLP). RAG enhances the output of large language\nmodels
287
+ by carefully selecting relevant information from external sources\nand KGs,
288
+ allowing for well-informed and detailed responses. KGs, on the\nother hand,
289
+ provide a structured representation of real-world entities\nand their relationships,
290
+ enabling the exploration of hidden insights and\nthe discovery of complex
291
+ connections.\n\nThe integration of RAG and KGs opens up two pipeline possibilities.
292
+ The\nLLM-centric pipeline prioritizes the language model''s output, which
293
+ is\nthen enriched with information retrieved from KGs. The Knowledge\nGraphs-centric
294
+ pipeline, on the other hand, places KGs at the center,\nutilizing RAG techniques
295
+ to translate the structured insights into\ncompelling and informative language.\n\nWhile
296
+ integrating LLMs and a knowledge graph for content retrieval\nrequires careful
297
+ planning, the reward is significant. You can gain\naccess to hidden relationships
298
+ within information, ultimately leading to\nhigher-quality output information.\n\nTools
299
+ like **OpenAI**, **Langchain**, and **LlamaIndex** provide\nready-made pipelines
300
+ to integrate knowledge graphs (like **Neo4j**)\neasily. Meanwhile, open-source
301
+ LLMs like **Mistral**, **Llama**, and\n**Dolphin** are catching up to proprietary
302
+ models in performance, making\nthem attractive choices for building custom
303
+ architectures. This\nopen-source scenario allows for the exploration and examination
304
+ of\nvarious methods before fully committing to a particular technological\nframework.
305
+ So, it is crucial to evaluate your needs and choose the\napproach that best
306
+ fits your use\u00a0case.\n\n![](https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=fc0a6900f7eb){width=\"1\"\nheight=\"1\"}\n","doi":"https://doi.org/10.59350/jhrs4-22440","guid":"https://medium.com/p/fc0a6900f7eb","id":"05f01f68-ef81-47d7-a3c1-40aba91d358f","image":"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png","indexed_at":1706690571,"language":"en","published_at":1705557796,"reference":[],"relationships":[],"summary":"<strong>\n
307
+ Tools and Platform for Integration of Knowledge Graph with RAG pipelines.\n</strong>\nAuthors:
308
+ Aland Astudillo, Aishwarya Nambissan Many users of chatbots such as ChatGPT,
309
+ have encountered the problem of receiving inappropriate or incompatible responses.
310
+ There are several reasons why this might\u00a0happen. One reason is the lack
311
+ of appropriate training data, as chatbots are usually trained on large amounts
312
+ of text and code.","tags":["Artificial-intelligence","Machine-learning","Retrieval-augmented","Knowledge-graph"],"title":"Unveiling
313
+ the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs","updated_at":1705557796,"url":"https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb"}
314
+
315
+ '
316
+ recorded_at: Wed, 31 Jan 2024 19:50:01 GMT
317
+ recorded_with: VCR 6.2.0
@@ -10,7 +10,7 @@ describe Commonmeta::Metadata, vcr: true do
10
10
  context 'read commonmeta metadata' do
11
11
  it "default" do
12
12
  expect(subject.valid?).to be true
13
- expect(subject.schema_version).to eq("https://commonmeta.org/commonmeta_v0.10.5.json")
13
+ expect(subject.schema_version).to eq("https://commonmeta.org/commonmeta_v0.10")
14
14
  expect(subject.id).to eq("https://doi.org/10.7554/elife.01567")
15
15
  expect(subject.type).to eq("JournalArticle")
16
16
  expect(subject.url).to eq("https://elifesciences.org/articles/01567")
@@ -36,13 +36,12 @@ describe Commonmeta::Metadata, vcr: true do
36
36
  # expect(subject.valid?).to be true
37
37
  expect(subject.id).to eq("https://doi.org/10.5063/f1m61h5x")
38
38
  expect(subject.type).to eq("Software")
39
- expect(subject.contributors).to eq([{"contributorRoles"=>["Author"],
40
- "name"=>
41
- "Jones, Matthew B.; Slaughter, Peter; Nahf, Rob; Boettiger, Carl ; Jones, Chris; Read, Jordan; Walker, Lauren; Hart, Edmund; Chamberlain, Scott",
42
- "type"=>"Organization"}])
43
- expect(subject.titles).to eq([{"title"=>"dataone: R interface to the DataONE network of data repositories"}])
44
- expect(subject.date).to eq("created"=>"2016-03-12", "published"=>"2016", "registered"=>"2016-03-12", "updated"=>"2020-09-18")
45
- expect(subject.publisher).to eq("name"=>"KNB Data Repository")
39
+ expect(subject.contributors).to eq([{ "contributorRoles" => ["Author"],
40
+ "name" => "Jones, Matthew B.; Slaughter, Peter; Nahf, Rob; Boettiger, Carl ; Jones, Chris; Read, Jordan; Walker, Lauren; Hart, Edmund; Chamberlain, Scott",
41
+ "type" => "Organization" }])
42
+ expect(subject.titles).to eq([{ "title" => "dataone: R interface to the DataONE network of data repositories" }])
43
+ expect(subject.date).to eq("created" => "2016-03-12", "published" => "2016", "registered" => "2016-03-12", "updated" => "2020-09-18")
44
+ expect(subject.publisher).to eq("name" => "KNB Data Repository")
46
45
  expect(subject.provider).to eq("DataCite")
47
46
  end
48
47
 
@@ -70,8 +69,8 @@ describe Commonmeta::Metadata, vcr: true do
70
69
  expect(subject.type).to eq("Dissertation")
71
70
  expect(subject.contributors.length).to eq(3)
72
71
  expect(subject.contributors.first).to eq("type" => "Person", "contributorRoles" => ["Author"],
73
- "givenName" => "Heiko", "familyName" => "Conrad")
74
- expect(subject.contributors.last).to eq("id"=>"https://orcid.org/0000-0002-8633-8234", "type"=>"Person", "contributorRoles"=>["Supervision"], "givenName"=>"Gerhard", "familyName"=>"Gruebel", "affiliation"=>[{"name"=>"Deutsches Elektronen-Synchrotron"}])
72
+ "givenName" => "Heiko", "familyName" => "Conrad")
73
+ expect(subject.contributors.last).to eq("id" => "https://orcid.org/0000-0002-8633-8234", "type" => "Person", "contributorRoles" => ["Supervision"], "givenName" => "Gerhard", "familyName" => "Gruebel", "affiliation" => [{ "name" => "Deutsches Elektronen-Synchrotron" }])
75
74
  expect(subject.titles).to eq([{ "title" => "Dynamics of colloids in molecular glass forming liquids studied via X-ray photon correlation spectroscopy" }])
76
75
  expect(subject.date).to eq("created" => "2018-01-25", "published" => "2014",
77
76
  "registered" => "2018-01-25", "updated" => "2020-09-19")
@@ -91,7 +90,7 @@ describe Commonmeta::Metadata, vcr: true do
91
90
  "affiliation" => [{ "name" => "Тверская государственная сельскохозяйственная академия" }], "familyName" => "Ганичева", "givenName" => "А.В.", "type" => "Person", "contributorRoles" => ["Author"],
92
91
  )
93
92
  expect(subject.titles.last).to eq("title" => "MODEL OF SYSTEM DYNAMICS OF PROCESS OF TRAINING",
94
- "titleType" => "TranslatedTitle")
93
+ "type" => "TranslatedTitle")
95
94
  expect(subject.date).to eq("created" => "2019-02-12", "published" => "2019",
96
95
  "registered" => "2019-02-12", "updated" => "2022-08-23")
97
96
  expect(subject.publisher).to eq("name" => "МОДЕЛИРОВАНИЕ, ОПТИМИЗАЦИЯ И ИНФОРМАЦИОННЫЕ ТЕХНОЛОГИИ")
@@ -115,10 +114,14 @@ describe Commonmeta::Metadata, vcr: true do
115
114
  expect(subject.contributors.first).to eq(
116
115
  "name" => "Europäische Kommission", "contributorRoles" => ["Author"], "type" => "Organization",
117
116
  )
118
- expect(subject.titles).to eq([
119
- { "lang" => "de",
120
- "title" => "Flash Eurobarometer 54 (Madrid Summit)" }, { "lang" => "en", "title" => "Flash Eurobarometer 54 (Madrid Summit)" }, { "titleType" => "Subtitle", "lang" => "de", "title" => "The Common European Currency" }, { "titleType" => "Subtitle", "lang" => "en", "title" => "The Common European Currency" },
121
- ])
117
+ expect(subject.titles).to eq([{ "language" => "de", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
118
+ { "language" => "en", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
119
+ { "language" => "de",
120
+ "title" => "The Common European Currency",
121
+ "type" => "Subtitle" },
122
+ { "language" => "en",
123
+ "title" => "The Common European Currency",
124
+ "type" => "Subtitle" }])
122
125
  expect(subject.subjects).to eq([{ "lang" => "en",
123
126
  "subject" => "KAT12 International Institutions, Relations, Conditions",
124
127
  "subjectScheme" => "ZA" },
@@ -155,5 +158,56 @@ describe Commonmeta::Metadata, vcr: true do
155
158
  expect(subject.license).to eq("id" => "CC-BY-4.0",
156
159
  "url" => "https://creativecommons.org/licenses/by/4.0/legalcode")
157
160
  end
161
+
162
+ it "dataset schema v4.5" do
163
+ input = "#{fixture_path}datacite-dataset_v4.5.json"
164
+ subject = described_class.new(input: input)
165
+ expect(subject.id).to eq("https://doi.org/10.82433/b09z-4k37")
166
+ expect(subject.type).to eq("Dataset")
167
+ expect(subject.contributors.length).to eq(23)
168
+ expect(subject.contributors[0]).to eq("contributorRoles" => ["Author"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
169
+ expect(subject.contributors[2]).to eq("contributorRoles" => ["ContactPerson"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
170
+ expect(subject.date).to eq("created" => "2022-10-27", "published" => "2022", "registered" => "2022-10-27", "updated" => "2024-01-02")
171
+ expect(subject.publisher).to eq("name" => "Example Publisher")
172
+ expect(subject.titles).to eq([{ "language" => "en", "title" => "Example Title" },
173
+ { "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
174
+ { "language" => "fr",
175
+ "title" => "Example TranslatedTitle",
176
+ "type" => "TranslatedTitle" },
177
+ { "language" => "en",
178
+ "title" => "Example AlternativeTitle",
179
+ "type" => "AlternativeTitle" }])
180
+ expect(subject.descriptions).to eq([{ "description" => "Example Abstract",
181
+ "type" => "Abstract",
182
+ "language" => "en" },
183
+ { "description" => "Example Methods",
184
+ "type" => "Methods",
185
+ "language" => "en" },
186
+ { "description" => "Example SeriesInformation",
187
+ "type" => "Other",
188
+ "language" => "en" },
189
+ { "description" => "Example TableOfContents",
190
+ "type" => "Other",
191
+ "language" => "en" },
192
+ { "description" => "Example TechnicalInfo",
193
+ "type" => "TechnicalInfo",
194
+ "language" => "en" },
195
+ { "description" => "Example Other", "type" => "Other", "language" => "en" }])
196
+ expect(subject.license).to eq("id" => "CC-PDDC", "url" => "https://creativecommons.org/licenses/publicdomain/")
197
+ end
198
+
199
+ it "instrument" do
200
+ input = "#{fixture_path}datacite-instrument.json"
201
+ subject = described_class.new(input: input)
202
+ puts subject.errors unless subject.valid?
203
+ expect(subject.valid?).to be true
204
+ expect(subject.id).to eq("https://doi.org/10.82433/08qf-ee96")
205
+ expect(subject.type).to eq("Instrument")
206
+ expect(subject.contributors.length).to eq(2)
207
+ expect(subject.contributors.first).to eq("contributorRoles" => ["Author"], "name" => "DECTRIS", "type" => "Organization", "id" => "https://www.wikidata.org/wiki/Q107529885")
208
+ expect(subject.date).to eq("created" => "2022-10-20", "published" => "2022", "registered" => "2022-10-20", "updated" => "2024-01-02")
209
+ expect(subject.publisher).to eq("name" => "Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences")
210
+ expect(subject.license).to be_nil
211
+ end
158
212
  end
159
213
  end
@@ -189,6 +189,31 @@ describe Commonmeta::Metadata, vcr: true do
189
189
  expect(subject.references).to be_nil
190
190
  end
191
191
 
192
+ it "medium post with institutional author" do
193
+ input = "https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f"
194
+ subject = described_class.new(input: input)
195
+ # expect(subject.valid?).to be true
196
+ expect(subject.id).to eq("https://doi.org/10.59350/jhrs4-22440")
197
+ expect(subject.url).to eq("https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb")
198
+ expect(subject.alternate_identifiers).to eq([{ "alternateIdentifier" => "05f01f68-ef81-47d7-a3c1-40aba91d358f", "alternateIdentifierType" => "UUID" }])
199
+ expect(subject.type).to eq("Article")
200
+ expect(subject.contributors.length).to eq(1)
201
+ expect(subject.contributors.first).to eq("contributorRoles"=>["Author"], "name"=>"Research Graph", "type"=>"Organization")
202
+ expect(subject.titles).to eq([{ "title" => "Unveiling the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs" }])
203
+ expect(subject.license).to eq("id" => "CC-BY-4.0",
204
+ "url" => "https://creativecommons.org/licenses/by/4.0/legalcode")
205
+ expect(subject.date).to eq("published"=>"2024-01-18", "updated"=>"2024-01-18")
206
+ expect(subject.descriptions.first["description"]).to start_with("<strong> Tools and Platform for Integration of Knowledge Graph with RAG pipelines.")
207
+ expect(subject.publisher).to eq("name" => "Research Graph")
208
+ expect(subject.subjects).to eq([{ "subject" => "Computer and information sciences" },
209
+ { "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf",
210
+ "subject" => "FOS: Computer and information sciences",
211
+ "subjectScheme" => "Fields of Science and Technology (FOS)" }])
212
+ expect(subject.language).to eq("en")
213
+ expect(subject.container).to eq("identifier" => "https://medium.com/@researchgraph", "identifierType" => "URL", "title" => "Research Graph", "type" => "Periodical")
214
+ expect(subject.references).to be_nil
215
+ end
216
+
192
217
  it "syldavia gazette post with references" do
193
218
  input = "https://api.rogue-scholar.org/posts/0022b9ef-525a-4a79-81ad-13411697f58a"
194
219
  subject = described_class.new(input: input)
data/spec/utils_spec.rb CHANGED
@@ -502,7 +502,7 @@ describe Commonmeta::Metadata do
502
502
  links = [{ "rel" => "self", "type" => "application/atom+xml", "href" => "https://syldavia-gazette.org/atom/" },
503
503
  { "rel" => "alternate", "type" => "text/html", "href" => "https://syldavia-gazette.org" },
504
504
  { "rel" => "license", "type" => "text/html", "href" => "https://creativecommons.org/licenses/by/4.0/legalcode" }]
505
-
505
+
506
506
  it "url" do
507
507
  response = subject.get_link(links, "self")
508
508
  expect(response).to eq("https://syldavia-gazette.org/atom/")
@@ -721,7 +721,7 @@ describe Commonmeta::Metadata do
721
721
  it "decode doi to uuid" do
722
722
  doi = "https://doi.org/10.53731/6315bn4-aqg82ja-4a9wxdt-29f7279"
723
723
  response = subject.decode_doi(doi, uuid: true)
724
- expect(response).to eq('255d48ab-c102-9288-a4f3-add092f388e9')
724
+ expect(response).to eq("255d48ab-c102-9288-a4f3-add092f388e9")
725
725
  end
726
726
  end
727
727
 
@@ -745,10 +745,36 @@ describe Commonmeta::Metadata do
745
745
  end
746
746
  end
747
747
 
748
- context 'json_feed_unregistered_url' do
749
- it 'all posts' do
748
+ context "json_feed_unregistered_url" do
749
+ it "all posts" do
750
750
  response = subject.json_feed_unregistered_url
751
751
  expect(response).to eq("https://api.rogue-scholar.org/posts/unregistered")
752
752
  end
753
753
  end
754
+
755
+ context "normalize_name_identifier" do
756
+ it "ORCID" do
757
+ hsh = {"schemeUri"=>"https://orcid.org", "nameIdentifier"=>"https://orcid.org/0000-0003-1419-2405", "nameIdentifierScheme"=>"ORCID"}
758
+ response = subject.normalize_name_identifier(hsh)
759
+ expect(response).to eq("https://orcid.org/0000-0003-1419-2405")
760
+ end
761
+
762
+ it "ROR" do
763
+ hsh = { "schemeUri" => "https://ror.org", "nameIdentifier" => "https://ror.org/02aj13c28", "nameIdentifierScheme" => "ROR" }
764
+ response = subject.normalize_name_identifier(hsh)
765
+ expect(response).to eq("https://ror.org/02aj13c28")
766
+ end
767
+
768
+ it "ISNI" do
769
+ hsh = { "schemeUri" => "http://isni.org/isni/", "nameIdentifier" => "0000000134596520", "nameIdentifierScheme" => "ISNI" }
770
+ response = subject.normalize_name_identifier(hsh)
771
+ expect(response).to eq("https://isni.org/isni/0000000134596520")
772
+ end
773
+
774
+ it "Wikidata" do
775
+ hsh = {"schemeUri"=>"https://www.wikidata.org/wiki/", "nameIdentifier"=>"Q107529885", "nameIdentifierScheme"=>"Wikidata"}
776
+ response = subject.normalize_name_identifier(hsh)
777
+ expect(response).to eq("https://www.wikidata.org/wiki/Q107529885")
778
+ end
779
+ end
754
780
  end
@@ -33,11 +33,38 @@ describe Commonmeta::Metadata, vcr: true do
33
33
  "volume" => "426",
34
34
  "firstPage" => "181",
35
35
  "containerTitle" => "Nature")
36
- expect(json["date"]).to eq("published"=>"2014-02-11", "updated"=>"2022-03-26")
36
+ expect(json["date"]).to eq("published" => "2014-02-11", "updated" => "2022-03-26")
37
37
  expect(json["descriptions"].first["description"]).to start_with("Among various advantages,")
38
- expect(json["license"]).to eq("id"=>"CC-BY-3.0", "url"=>"https://creativecommons.org/licenses/by/3.0/legalcode")
38
+ expect(json["license"]).to eq("id" => "CC-BY-3.0", "url" => "https://creativecommons.org/licenses/by/3.0/legalcode")
39
39
  expect(json["provider"]).to eq("Crossref")
40
- expect(json["files"].first).to eq("mimeType"=>"application/pdf", "url"=>"https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
40
+ expect(json["files"].first).to eq("mimeType" => "application/pdf", "url" => "https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
41
+ end
42
+
43
+ it "dataset schema v4.5" do
44
+ input = "#{fixture_path}datacite-dataset_v4.5.json"
45
+ subject = described_class.new(input: input)
46
+ expect(subject.id).to eq("https://doi.org/10.82433/b09z-4k37")
47
+ json = JSON.parse(subject.commonmeta)
48
+ expect(json["id"]).to eq("https://doi.org/10.82433/b09z-4k37")
49
+ expect(json["type"]).to eq("Dataset")
50
+ expect(json["titles"]).to eq([{ "language" => "en", "title" => "Example Title" },
51
+ { "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
52
+ { "language" => "fr",
53
+ "title" => "Example TranslatedTitle",
54
+ "type" => "TranslatedTitle" },
55
+ { "language" => "en",
56
+ "title" => "Example AlternativeTitle",
57
+ "type" => "AlternativeTitle" }])
58
+ expect(json["descriptions"]).to eq([{ "description" => "Example Abstract", "language" => "en", "type" => "Abstract" },
59
+ { "description" => "Example Methods", "language" => "en", "type" => "Methods" },
60
+ { "description" => "Example SeriesInformation",
61
+ "language" => "en",
62
+ "type" => "Other" },
63
+ { "description" => "Example TableOfContents", "language" => "en", "type" => "Other" },
64
+ { "description" => "Example TechnicalInfo",
65
+ "language" => "en",
66
+ "type" => "TechnicalInfo" },
67
+ { "description" => "Example Other", "language" => "en", "type" => "Other" }])
41
68
  end
42
69
  end
43
70
  end
@@ -7,6 +7,7 @@ describe Commonmeta::Metadata, vcr: true do
7
7
  it 'Dataset' do
8
8
  input = 'https://doi.org/10.5061/DRYAD.8515'
9
9
  subject = described_class.new(input: input, from: 'datacite')
10
+ puts subject.errors unless subject.valid?
10
11
  expect(subject.valid?).to be true
11
12
  json = JSON.parse(subject.csl)
12
13
  expect(json['type']).to eq('dataset')
@@ -37,6 +37,7 @@ describe Commonmeta::Metadata, vcr: true do
37
37
  it 'text' do
38
38
  input = 'https://doi.org/10.3204/desy-2014-01645'
39
39
  subject = described_class.new(input: input, from: 'datacite')
40
+ puts subject.errors unless subject.valid?
40
41
  expect(subject.valid?).to be true
41
42
  csv = subject.csv.parse_csv
42
43
 
@@ -136,7 +136,6 @@ describe Commonmeta::Metadata, vcr: true do
136
136
  it 'from schema.org' do
137
137
  input = 'https://blog.front-matter.io/posts/eating-your-own-dog-food/'
138
138
  subject = described_class.new(input: input, from: 'schema_org')
139
- puts subject.errors
140
139
  expect(subject.valid?).to be true
141
140
  datacite = JSON.parse(subject.datacite)
142
141
  expect(datacite.fetch('titles')).to eq([{ 'title' => 'Eating your own Dog Food' }])