commonmeta-ruby 3.11.0 → 3.12.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -4
- data/lib/commonmeta/author_utils.rb +9 -6
- data/lib/commonmeta/readers/datacite_reader.rb +117 -111
- data/lib/commonmeta/schema_utils.rb +1 -1
- data/lib/commonmeta/version.rb +1 -1
- data/lib/commonmeta/writers/commonmeta_writer.rb +1 -1
- data/resources/{commonmeta_v0.10.6.json → commonmeta_v0.10.7.json} +11 -3
- data/spec/author_utils_spec.rb +15 -0
- data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/medium_post_with_institutional_author.yml +317 -0
- data/spec/readers/datacite_reader_spec.rb +36 -7
- data/spec/readers/json_feed_reader_spec.rb +25 -0
- data/spec/writers/commonmeta_writer_spec.rb +30 -3
- data/spec/writers/csl_writer_spec.rb +1 -0
- data/spec/writers/csv_writer_spec.rb +1 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b48e44936ddd71a38a9d019c33972d02cf66af57c02542f48d37a022017c8208
|
4
|
+
data.tar.gz: deb5ca7a1b1ec9387583e0039edc45ab382d8989c31012dacd81560c8cbaeaa3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e72bf66b0e72b62640d6f528c2279b119499a225acbe26498efe2afc7c5679b018175097144f6fef3a592186ae518e4073b9f209a31a56bda82e697fc3287408
|
7
|
+
data.tar.gz: '099621bbc5109437cf592a34bb8810d6d7c6a26c68bec0e13c43061410066218bf38fc0a3f809a84dc9f2861075a2b6a53d41836da9c2e909e5096529b7f87cd'
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
commonmeta-ruby (3.
|
4
|
+
commonmeta-ruby (3.12.1)
|
5
5
|
activesupport (>= 4.2.5, < 8.0)
|
6
6
|
addressable (~> 2.8.1, < 2.8.2)
|
7
7
|
base32-url (>= 0.7.0, < 1)
|
@@ -58,7 +58,8 @@ GEM
|
|
58
58
|
rubocop (~> 1.0)
|
59
59
|
concurrent-ruby (1.2.3)
|
60
60
|
connection_pool (2.4.1)
|
61
|
-
crack (0.4.
|
61
|
+
crack (0.4.6)
|
62
|
+
bigdecimal
|
62
63
|
rexml
|
63
64
|
crass (1.0.6)
|
64
65
|
csl (2.0.0)
|
@@ -66,7 +67,7 @@ GEM
|
|
66
67
|
rexml
|
67
68
|
csl-styles (2.0.1)
|
68
69
|
csl (~> 2.0)
|
69
|
-
diff-lcs (1.5.
|
70
|
+
diff-lcs (1.5.1)
|
70
71
|
docile (1.4.0)
|
71
72
|
domain_name (0.6.20240107)
|
72
73
|
drb (2.2.0)
|
@@ -154,7 +155,7 @@ GEM
|
|
154
155
|
iniparser (>= 0.1.0)
|
155
156
|
public_suffix (4.0.7)
|
156
157
|
racc (1.7.3)
|
157
|
-
rack (3.0.
|
158
|
+
rack (3.0.9)
|
158
159
|
rack-test (2.1.0)
|
159
160
|
rack (>= 1.3)
|
160
161
|
rainbow (3.1.1)
|
@@ -25,8 +25,8 @@ module Commonmeta
|
|
25
25
|
"Researcher" => "Other",
|
26
26
|
"Sponsor" => "Other",
|
27
27
|
"Supervisor" => "Supervision",
|
28
|
-
"WorkPackageLeader" => "Other"
|
29
|
-
}
|
28
|
+
"WorkPackageLeader" => "Other",
|
29
|
+
}
|
30
30
|
|
31
31
|
def get_one_author(author)
|
32
32
|
# basic sanity checks
|
@@ -55,20 +55,20 @@ module Commonmeta
|
|
55
55
|
parse_attributes(author.fetch("identifier", nil), first: true) ||
|
56
56
|
parse_attributes(author.fetch("sameAs", nil), first: true)
|
57
57
|
id = normalize_orcid(id) || normalize_ror(id) if id.present?
|
58
|
-
|
58
|
+
|
59
59
|
# DataCite metadata
|
60
60
|
if id.nil? && author["nameIdentifiers"].present?
|
61
61
|
id = Array.wrap(author.dig("nameIdentifiers")).find do |ni|
|
62
62
|
normalize_name_identifier(ni).present?
|
63
63
|
end
|
64
64
|
id = normalize_name_identifier(id) if id.present?
|
65
|
-
|
65
|
+
# Crossref metadata
|
66
66
|
elsif id.nil? && author["ORCID"].present?
|
67
67
|
id = author.fetch("ORCID")
|
68
68
|
id = normalize_orcid(id)
|
69
|
-
|
69
|
+
# JSON Feed metadata
|
70
70
|
elsif id.nil? && author["url"].present?
|
71
|
-
id = author.fetch("url")
|
71
|
+
id = author.fetch("url")
|
72
72
|
end
|
73
73
|
|
74
74
|
# parse author type, i.e. "Person", "Organization" or not specified
|
@@ -168,6 +168,9 @@ module Commonmeta
|
|
168
168
|
# check if a name has only one word, e.g. "FamousOrganization", not including commas
|
169
169
|
return false if name.to_s.split(" ").size == 1 && name.to_s.exclude?(",")
|
170
170
|
|
171
|
+
# check if name contains words known to be used in organization names
|
172
|
+
return false if %w[University College Institute School Center Department Laboratory Library Museum Foundation Society Association Company Corporation Collaboration Consortium Incorporated Inc. Institut Research Science Team].any? { |word| name.to_s.include?(word) }
|
173
|
+
|
171
174
|
# check for suffixes, e.g. "John Smith, MD"
|
172
175
|
return true if name && %w[MD PhD].include?(name.split(", ").last)
|
173
176
|
|
@@ -4,29 +4,29 @@ module Commonmeta
|
|
4
4
|
module Readers
|
5
5
|
module DataciteReader
|
6
6
|
def get_datacite(id: nil, **options)
|
7
|
-
return {
|
7
|
+
return { "string" => nil, "state" => "not_found" } unless id.present?
|
8
8
|
|
9
9
|
api_url = datacite_api_url(id, options)
|
10
10
|
response = HTTP.get(api_url)
|
11
|
-
return {
|
11
|
+
return { "string" => nil, "state" => "not_found" } unless response.status.success?
|
12
12
|
|
13
13
|
body = JSON.parse(response.body)
|
14
|
-
client = Array.wrap(body.fetch(
|
15
|
-
m[
|
14
|
+
client = Array.wrap(body.fetch("included", nil)).find do |m|
|
15
|
+
m["type"] == "clients"
|
16
16
|
end
|
17
|
-
client_id = client.to_h.fetch(
|
18
|
-
provider_id = Array.wrap(client.to_h.fetch(
|
19
|
-
m[
|
20
|
-
end.to_h.dig(
|
21
|
-
|
22
|
-
{
|
23
|
-
|
24
|
-
|
17
|
+
client_id = client.to_h.fetch("id", nil)
|
18
|
+
provider_id = Array.wrap(client.to_h.fetch("relationships", nil)).find do |m|
|
19
|
+
m["provider"].present?
|
20
|
+
end.to_h.dig("provider", "data", "id")
|
21
|
+
|
22
|
+
{ "string" => response.body.to_s,
|
23
|
+
"provider_id" => provider_id,
|
24
|
+
"client_id" => client_id }
|
25
25
|
end
|
26
26
|
|
27
27
|
def read_datacite(string: nil, **_options)
|
28
28
|
errors = jsonlint(string)
|
29
|
-
return {
|
29
|
+
return { "errors" => errors } if errors.present?
|
30
30
|
|
31
31
|
read_options = ActiveSupport::HashWithIndifferentAccess.new(_options.except(:doi, :id, :url,
|
32
32
|
:sandbox, :validate, :ra))
|
@@ -34,140 +34,146 @@ module Commonmeta
|
|
34
34
|
meta = string.present? ? JSON.parse(string) : {}
|
35
35
|
|
36
36
|
# optionally strip out the message wrapper from API
|
37
|
-
meta = meta.dig(
|
37
|
+
meta = meta.dig("data", "attributes") if meta.dig("data").present?
|
38
38
|
|
39
39
|
meta.transform_keys!(&:underscore)
|
40
40
|
|
41
|
-
id = normalize_doi(meta.fetch(
|
41
|
+
id = normalize_doi(meta.fetch("doi", nil))
|
42
42
|
|
43
|
-
resource_type_general = meta.dig(
|
44
|
-
resource_type = meta.dig(
|
43
|
+
resource_type_general = meta.dig("types", "resourceTypeGeneral")
|
44
|
+
resource_type = meta.dig("types", "resourceType")
|
45
45
|
# if resource_type is one of the new resource_type_general types introduced in schema 4.3, use it
|
46
46
|
type = Commonmeta::Utils::DC_TO_CM_TRANSLATIONS.fetch(resource_type, nil) ||
|
47
|
-
Commonmeta::Utils::DC_TO_CM_TRANSLATIONS.fetch(resource_type_general,
|
47
|
+
Commonmeta::Utils::DC_TO_CM_TRANSLATIONS.fetch(resource_type_general, "Other")
|
48
48
|
|
49
|
-
alternate_identifiers = Array.wrap(meta.fetch(
|
49
|
+
alternate_identifiers = Array.wrap(meta.fetch("alternate_identifiers", nil)).map do |i|
|
50
50
|
i.transform_keys! { |k| k.camelize(:lower) }
|
51
51
|
end
|
52
|
-
url = meta.fetch(
|
53
|
-
titles = Array.wrap(meta.fetch(
|
54
|
-
title.
|
52
|
+
url = meta.fetch("url", nil)
|
53
|
+
titles = Array.wrap(meta.fetch("titles", nil)).map do |title|
|
54
|
+
{ "title" => title.fetch("title", nil),
|
55
|
+
"type" => title.fetch("titleType", nil),
|
56
|
+
"language" => title.fetch("lang", nil) }.compact
|
55
57
|
end
|
56
|
-
contributors = get_authors(from_datacite(meta.fetch(
|
57
|
-
contributors += get_authors(from_datacite(meta.fetch(
|
58
|
-
if meta.fetch(
|
59
|
-
publisher = {
|
60
|
-
elsif meta.fetch(
|
61
|
-
publisher = {
|
58
|
+
contributors = get_authors(from_datacite(meta.fetch("creators", nil)))
|
59
|
+
contributors += get_authors(from_datacite(meta.fetch("contributors", nil)))
|
60
|
+
if meta.fetch("publisher", nil).is_a?(Hash)
|
61
|
+
publisher = { "name" => meta.fetch("publisher", nil).fetch("name", nil) }
|
62
|
+
elsif meta.fetch("publisher", nil).is_a?(String)
|
63
|
+
publisher = { "name" => meta.fetch("publisher", nil) }
|
62
64
|
else
|
63
65
|
publisher = nil
|
64
66
|
end
|
65
67
|
|
66
|
-
container = meta.fetch(
|
67
|
-
funding_references = meta.fetch(
|
68
|
+
container = meta.fetch("container", nil)
|
69
|
+
funding_references = meta.fetch("funding_references", nil)
|
68
70
|
|
69
71
|
date = {}
|
70
|
-
date[
|
71
|
-
get_iso8601_date(meta.dig(
|
72
|
-
date[
|
73
|
-
get_iso8601_date(meta.dig(
|
74
|
-
|
75
|
-
date[
|
76
|
-
date[
|
77
|
-
get_iso8601_date(meta.dig(
|
78
|
-
|
79
|
-
descriptions = Array.wrap(meta.fetch(
|
80
|
-
description.
|
72
|
+
date["created"] =
|
73
|
+
get_iso8601_date(meta.dig("created")) || get_date(meta.dig("dates"), "Created")
|
74
|
+
date["published"] =
|
75
|
+
get_iso8601_date(meta.dig("published")) || get_date(meta.dig("dates"),
|
76
|
+
"Issued") || get_iso8601_date(meta.dig("publication_year"))
|
77
|
+
date["registered"] = get_iso8601_date(meta.dig("registered"))
|
78
|
+
date["updated"] =
|
79
|
+
get_iso8601_date(meta.dig("updated")) || get_date(meta.dig("dates"), "Updated")
|
80
|
+
|
81
|
+
descriptions = Array.wrap(meta.fetch("descriptions", nil)).map do |description|
|
82
|
+
description_type = description.fetch("descriptionType", nil)
|
83
|
+
description_type = "Other" unless %w[Abstract Methods TechnicalInfo].include?(description_type)
|
84
|
+
{ "description" => description.fetch("description", nil),
|
85
|
+
"type" => description_type,
|
86
|
+
"language" => description.fetch("lang", nil) }.compact
|
81
87
|
end
|
82
|
-
license = Array.wrap(meta.fetch(
|
83
|
-
r[
|
88
|
+
license = Array.wrap(meta.fetch("rights_list", nil)).find do |r|
|
89
|
+
r["rightsUri"].present?
|
84
90
|
end
|
85
|
-
license = hsh_to_spdx(
|
86
|
-
version = meta.fetch(
|
87
|
-
subjects = meta.fetch(
|
88
|
-
language = meta.fetch(
|
89
|
-
geo_locations = meta.fetch(
|
90
|
-
references = (Array.wrap(meta.fetch(
|
91
|
-
nil)) + Array.wrap(meta.fetch(
|
91
|
+
license = hsh_to_spdx("rightsURI" => license["rightsUri"]) if license.present?
|
92
|
+
version = meta.fetch("version", nil)
|
93
|
+
subjects = meta.fetch("subjects", nil)
|
94
|
+
language = meta.fetch("language", nil)
|
95
|
+
geo_locations = meta.fetch("geo_locations", nil)
|
96
|
+
references = (Array.wrap(meta.fetch("related_identifiers",
|
97
|
+
nil)) + Array.wrap(meta.fetch("related_items",
|
92
98
|
nil))).select do |r|
|
93
|
-
|
94
|
-
|
99
|
+
%w[References Cites IsSupplementedBy].include?(r["relationType"])
|
100
|
+
end.map do |reference|
|
95
101
|
get_datacite_reference(reference)
|
96
102
|
end
|
97
|
-
files = Array.wrap(meta.fetch("content_url", nil)).map { |file| { "url" => file } }
|
98
|
-
formats = meta.fetch(
|
99
|
-
sizes = meta.fetch(
|
100
|
-
schema_version = meta.fetch(
|
101
|
-
state = id.present? || read_options.present? ?
|
102
|
-
|
103
|
-
{
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
103
|
+
files = Array.wrap(meta.fetch("content_url", nil)).map { |file| { "url" => file } }
|
104
|
+
formats = meta.fetch("formats", nil)
|
105
|
+
sizes = meta.fetch("sizes", nil)
|
106
|
+
schema_version = meta.fetch("schema_version", nil) || "http://datacite.org/schema/kernel-4"
|
107
|
+
state = id.present? || read_options.present? ? "findable" : "not_found"
|
108
|
+
|
109
|
+
{ "id" => id,
|
110
|
+
"type" => type,
|
111
|
+
"additional_type" => resource_type == type ? nil : resource_type,
|
112
|
+
"url" => url,
|
113
|
+
"titles" => titles,
|
114
|
+
"contributors" => contributors,
|
115
|
+
"container" => container,
|
116
|
+
"publisher" => publisher,
|
117
|
+
"provider" => "DataCite",
|
118
|
+
"alternate_identifiers" => alternate_identifiers.presence,
|
119
|
+
"references" => references,
|
120
|
+
"funding_references" => funding_references,
|
121
|
+
"files" => files.presence,
|
122
|
+
"date" => date.compact,
|
123
|
+
"descriptions" => descriptions,
|
124
|
+
"license" => license,
|
125
|
+
"version" => version,
|
126
|
+
"subjects" => subjects,
|
127
|
+
"language" => language,
|
128
|
+
"geo_locations" => geo_locations,
|
129
|
+
"formats" => formats,
|
130
|
+
"sizes" => sizes,
|
131
|
+
"state" => state }.compact # .merge(read_options)
|
126
132
|
end
|
127
133
|
|
128
134
|
def format_contributor(contributor)
|
129
|
-
type = contributor.fetch(
|
130
|
-
|
131
|
-
{
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
135
|
+
type = contributor.fetch("nameType", nil)
|
136
|
+
|
137
|
+
{ "name" => type == "Person" ? nil : contributor.fetch("name", nil),
|
138
|
+
"type" => type,
|
139
|
+
"givenName" => contributor.fetch("givenName", nil),
|
140
|
+
"familyName" => contributor.fetch("familyName", nil),
|
141
|
+
"nameIdentifiers" => contributor.fetch("nameIdentifiers", nil).presence,
|
142
|
+
"affiliations" => contributor.fetch("affiliations", nil).presence,
|
143
|
+
"contributorType" => contributor.fetch("contributorType", nil) }.compact
|
138
144
|
end
|
139
145
|
|
140
146
|
def get_datacite_reference(reference)
|
141
147
|
return nil unless reference.present? || !reference.is_a?(Hash)
|
142
148
|
|
143
|
-
key = reference[
|
149
|
+
key = reference["relatedIdentifier"]
|
144
150
|
doi = nil
|
145
151
|
url = nil
|
146
152
|
|
147
|
-
case reference[
|
148
|
-
when
|
149
|
-
doi = normalize_doi(reference[
|
150
|
-
when
|
151
|
-
url = reference[
|
153
|
+
case reference["relatedIdentifierType"]
|
154
|
+
when "DOI"
|
155
|
+
doi = normalize_doi(reference["relatedIdentifier"])
|
156
|
+
when "URL"
|
157
|
+
url = reference["relatedIdentifier"]
|
152
158
|
else
|
153
|
-
url = reference[
|
159
|
+
url = reference["relatedIdentifier"]
|
154
160
|
end
|
155
161
|
|
156
162
|
{
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
163
|
+
"key" => key,
|
164
|
+
"doi" => doi,
|
165
|
+
"url" => url,
|
166
|
+
"contributor" => reference.dig("author"),
|
167
|
+
"title" => reference.dig("article-title"),
|
168
|
+
"publisher" => reference.dig("publisher"),
|
169
|
+
"publicationYear" => reference.dig("year"),
|
170
|
+
"volume" => reference.dig("volume"),
|
171
|
+
"issue" => reference.dig("issue"),
|
172
|
+
"firstPage" => reference.dig("first-page"),
|
173
|
+
"lastPage" => reference.dig("last-page"),
|
174
|
+
"containerTitle" => reference.dig("journal-title"),
|
175
|
+
"edition" => nil,
|
176
|
+
"unstructured" => doi.nil? ? reference.dig("unstructured") : nil,
|
171
177
|
}.compact
|
172
178
|
end
|
173
179
|
end
|
@@ -5,7 +5,7 @@ require "pathname"
|
|
5
5
|
|
6
6
|
module Commonmeta
|
7
7
|
module SchemaUtils
|
8
|
-
COMMONMETA = File.read(File.expand_path("../../resources/commonmeta_v0.10.
|
8
|
+
COMMONMETA = File.read(File.expand_path("../../resources/commonmeta_v0.10.7.json",
|
9
9
|
__dir__))
|
10
10
|
|
11
11
|
def json_schema_errors
|
data/lib/commonmeta/version.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
{
|
2
2
|
"$schema": "http://json-schema.org/draft-07/schema#",
|
3
|
-
"$id": "https://commonmeta.org/commonmeta_v0.10.
|
4
|
-
"title": "Commonmeta v0.10.
|
3
|
+
"$id": "https://commonmeta.org/commonmeta_v0.10.7.json",
|
4
|
+
"title": "Commonmeta v0.10.7",
|
5
5
|
"description": "JSON representation of the Commonmeta schema.",
|
6
6
|
"additionalProperties": false,
|
7
7
|
"definitions": {
|
@@ -253,6 +253,10 @@
|
|
253
253
|
"description": "The type of the title.",
|
254
254
|
"type": "string",
|
255
255
|
"enum": ["AlternativeTitle", "Subtitle", "TranslatedTitle"]
|
256
|
+
},
|
257
|
+
"language": {
|
258
|
+
"description": "The language of the title. Use one of the language codes from the IETF BCP 47 standard.",
|
259
|
+
"type": "string"
|
256
260
|
}
|
257
261
|
},
|
258
262
|
"required": ["title"]
|
@@ -424,7 +428,11 @@
|
|
424
428
|
"type": {
|
425
429
|
"description": "The type of the description.",
|
426
430
|
"type": "string",
|
427
|
-
"enum": ["Abstract", "
|
431
|
+
"enum": ["Abstract", "Summary", "Methods", "TechnicalInfo", "Other"]
|
432
|
+
},
|
433
|
+
"language": {
|
434
|
+
"description": "The language of the title. Use one of the language codes from the IETF BCP 47 standard.",
|
435
|
+
"type": "string"
|
428
436
|
}
|
429
437
|
},
|
430
438
|
"required": ["description"]
|
data/spec/author_utils_spec.rb
CHANGED
@@ -58,6 +58,21 @@ describe Commonmeta::Metadata, vcr: true do
|
|
58
58
|
author = { "name" => "Tejas S. Sathe, MD" }
|
59
59
|
expect(subject.is_personal_name?(name: author["name"])).to be true
|
60
60
|
end
|
61
|
+
|
62
|
+
it "name with organization string" do
|
63
|
+
author = { "name" => "University of California, Santa Barbara" }
|
64
|
+
expect(subject.is_personal_name?(name: author["name"])).to be false
|
65
|
+
end
|
66
|
+
|
67
|
+
it "name with another organization string" do
|
68
|
+
author = { "name" => "Research Graph" }
|
69
|
+
expect(subject.is_personal_name?(name: author["name"])).to be false
|
70
|
+
end
|
71
|
+
|
72
|
+
it "name with ye another organization string" do
|
73
|
+
author = { "name" => "Team OA Brandenburg" }
|
74
|
+
expect(subject.is_personal_name?(name: author["name"])).to be false
|
75
|
+
end
|
61
76
|
end
|
62
77
|
|
63
78
|
context "cleanup_author" do
|
@@ -0,0 +1,317 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f
|
6
|
+
body:
|
7
|
+
encoding: ASCII-8BIT
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Connection:
|
11
|
+
- close
|
12
|
+
Host:
|
13
|
+
- api.rogue-scholar.org
|
14
|
+
User-Agent:
|
15
|
+
- http.rb/5.1.1
|
16
|
+
response:
|
17
|
+
status:
|
18
|
+
code: 200
|
19
|
+
message: OK
|
20
|
+
headers:
|
21
|
+
Content-Type:
|
22
|
+
- application/json
|
23
|
+
Content-Length:
|
24
|
+
- '23886'
|
25
|
+
Ratelimit-Limit:
|
26
|
+
- '15'
|
27
|
+
Ratelimit-Remaining:
|
28
|
+
- '14'
|
29
|
+
Ratelimit-Reset:
|
30
|
+
- '3'
|
31
|
+
Date:
|
32
|
+
- Wed, 31 Jan 2024 19:50:01 GMT
|
33
|
+
Server:
|
34
|
+
- Fly/ba9e227a (2024-01-26)
|
35
|
+
Via:
|
36
|
+
- 1.1 fly.io
|
37
|
+
Fly-Request-Id:
|
38
|
+
- 01HNGH4EZV3XQF20H1PZ6X5N07-fra
|
39
|
+
body:
|
40
|
+
encoding: UTF-8
|
41
|
+
string: '{"abstract":null,"archive_url":null,"authors":[{"name":"Research Graph"}],"blog":{"api":false,"archive_prefix":null,"authors":null,"backlog":0,"canonical_url":null,"category":"computerAndInformationSciences","created_at":1706685423,"current_feed_url":null,"description":"Stories
|
42
|
+
by Research Graph on Medium","favicon":"https://cdn-images-1.medium.com/fit/c/150/150/1*laJi0jBkVoGhXid7gD_DmQ.png","feed_format":"application/rss+xml","feed_url":"https://medium.com/@researchgraph/feed","filter":null,"funding":null,"generator":"Medium","generator_raw":"Medium","home_page_url":"https://medium.com/@researchgraph","id":"30da2ca9-8258-4ab5-acca-3919d9a5d98d","indexed":true,"issn":null,"language":"en","license":"https://creativecommons.org/licenses/by/4.0/legalcode","mastodon":"","plan":"Starter","prefix":"10.59350","relative_url":null,"ror":null,"secure":true,"slug":"researchgraph","status":"active","title":"Research
|
43
|
+
Graph","updated_at":1706151454,"use_api":null,"use_mastodon":false,"user_id":"a7e16958-1175-437c-b839-d4b8a47ec811","version":"https://jsonfeed.org/version/1.1"},"blog_name":"Research
|
44
|
+
Graph","blog_slug":"researchgraph","content_text":"**Tools and Platform for
|
45
|
+
Integration of Knowledge Graph with RAG\npipelines.**\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png\"\nalt=\"Complex
|
46
|
+
network connected to books and showing information from magespace\" />\n<figcaption>Image
|
47
|
+
Created in <a\nhref=\"https://www.mage.space/\">https://www.mage.space/</a></figcaption>\n</figure>\n\nAuthors:
|
48
|
+
[Aland\nAstudillo](https://www.linkedin.com/in/aland-astudillo/), [Aishwarya\nNambissan](https://www.linkedin.com/in/aishwarya-nambissan-127229200/)\n\nMany
|
49
|
+
users of chatbots such as ChatGPT, have encountered the problem of\nreceiving
|
50
|
+
inappropriate or incompatible responses. There are several\nreasons why this
|
51
|
+
might\u00a0happen.\n\nOne reason is the lack of appropriate training data,
|
52
|
+
as chatbots are\nusually trained on large amounts of text and code. If the
|
53
|
+
data is\ninsufficient or of poor quality, the chatbot may misunderstand queries\nand
|
54
|
+
provide inaccurate responses. Another reason is that some chatbots\nare designed
|
55
|
+
for specific tasks or domains, which limits their ability\nto handle broader
|
56
|
+
queries or understand subtle nuances in conversation.\nAdditionally, chatbots
|
57
|
+
may struggle with natural language, which is\ncomplex and often ambiguous.
|
58
|
+
This can cause them to misunderstand a\nuser''s query and provide irrelevant
|
59
|
+
or off-topic responses. Finally,\nthere are technical limitations, such as
|
60
|
+
the chatbot''s inability to\nreason or make inferences.\n\nThis article explores
|
61
|
+
a potential solution by combining two influential\napproaches in the field
|
62
|
+
of Natural Language Processing\u200a---\u200aRetrieval\nAugmented Generation
|
63
|
+
(**RAG**) and Knowledge Graphs(**KGs**). We will\ndelve into the partnership
|
64
|
+
between these two entities, discuss the\nnotable technologies and software
|
65
|
+
used in their processes, and highlight\nvarious options for utilizing their
|
66
|
+
combined potential.\n\n### **RAG**\n\nRetrieval-Augmented Generation is the
|
67
|
+
process of optimizing the output\nof a large language model using a knowledge
|
68
|
+
base outside of its training\ndata sources before generating a response. It
|
69
|
+
takes an input and\nretrieves a set of relevant/supporting documents given
|
70
|
+
a source (e.g.,\nWikipedia). This can be thought of as a Large Language Model
|
71
|
+
(LLM) not\njust putting words together, but carefully selecting relevant\ninformation
|
72
|
+
from external sources and Knowledge Graphs to create\nwell-informed and detailed
|
73
|
+
responses.\n\n### RAG Retrieval Techniques\n\nThe following are some crucial
|
74
|
+
technologies that enable RAG''s impressive\nability to retrieve and incorporate
|
75
|
+
relevant information:\n\n**Vector Search**: It transforms text into numerical
|
76
|
+
vectors, capturing\ntheir meaning and nuances in a mathematical space, creating
|
77
|
+
a map of\nrelationships. Similar texts, like those discussing shared topics
|
78
|
+
or\nusing similar language, end up positioned close together in this space,\nallowing
|
79
|
+
vector search to quickly identify them as related. This allows\nlightning-fast
|
80
|
+
comparisons, finding similar texts based on meaning, not\njust keywords.\n\nAlgorithms
|
81
|
+
like [**Faiss**](https://github.com/facebookresearch/faiss)\nand [**Annoy**](https://github.com/spotify/annoy)
|
82
|
+
map text into dense\nvectors, enabling fast comparisons and retrieval of relevant
|
83
|
+
passages\nbased on semantic similarity.\n\n**Passage Ranking**: It is an internal
|
84
|
+
algorithm that scores candidate\ntext passages based on their relevance to
|
85
|
+
a query. It considers factors\nlike keyword frequency, keyword overlap, and
|
86
|
+
document structure to act\nlike a judge, sifting through information to select
|
87
|
+
the most fitting and\ninformative passages.\n\nKeyword overlap measures how
|
88
|
+
often the same keywords appear in **both**\nthe query and the candidate passage,
|
89
|
+
emphasizing shared vocabulary and\npotential relevance. It differs from keyword
|
90
|
+
frequency, which simply\ncounts how often individual keywords appear within
|
91
|
+
a passage, regardless\nof their presence in the\u00a0query.\n\nTechniques
|
92
|
+
like [**BM25**](https://github.com/getalp/wikIR) and\n[**TF-IDF**](https://github.com/marcocor/wikipedia-idf)
|
93
|
+
score candidate\npassages based on keyword overlap and frequency, ensuring
|
94
|
+
retrieved\ninformation truly fits the\u00a0context.\n\n**Graph Neural Networks**
|
95
|
+
(**GNNs**): They are neural networks designed\nto explore and learn from interconnected
|
96
|
+
data like maps, social\nnetworks, and other complex relationships. Unlike
|
97
|
+
traditional processing\nmethods that go through data in a linear fashion,
|
98
|
+
GNNs are capable of\nrecognizing hidden patterns and understanding relationships
|
99
|
+
like \"who\nknows who\" and \"what connects to what\" by \"hopping\" across
|
100
|
+
connections\nin\u00a0data.\n\nConsider a graph as a network of dots(nodes)
|
101
|
+
connected by lines (edges).\nEach dot represents some information, like a
|
102
|
+
person, object, or concept.\nThe lines tell you how these things relate to
|
103
|
+
each\u00a0other.\n\nGNNs work in rounds. In each\u00a0round:\n\n1. Message
|
104
|
+
Passing: Each node \"talks\" to its neighbors, sending\n messages along
|
105
|
+
the edges. These messages contain information about\n the node itself and
|
106
|
+
its features.\n2. Node Update: Each node receives messages from all its neighbors
|
107
|
+
and\n combines them with its own information. This update can involve\n calculations
|
108
|
+
and applying a special function.\n3. Output Calculation: Based on the updated
|
109
|
+
information, the network\n calculates an output for each node. This output
|
110
|
+
could be a\n prediction about the node''s category, its relationship to
|
111
|
+
another\n node, or some other relevant information.\n\nThis process repeats
|
112
|
+
for multiple rounds, allowing nodes to incorporate\ninformation from their
|
113
|
+
entire neighborhood, not just their direct\nneighbors. As the rounds progress,
|
114
|
+
the network learns to understand the\nrelationships between nodes and the
|
115
|
+
overall structure of the\u00a0graph.\n\nWhen dealing with Knowledge Graphs,
|
116
|
+
frameworks like\n[**PyTorch-Geometric**](https://readthedocs.org/projects/pytorch-geometric/)\nand
|
117
|
+
[**DeepMind''s\nGNN**](https://github.com/deepmind/deepmind-research/blob/master/learning_to_simulate/graph_network.py)\nlibrary
|
118
|
+
come into play. These frameworks allow GNNs to traverse\ninterconnected entities
|
119
|
+
and relationships within the graph, retrieve\nrelevant knowledge fragments,
|
120
|
+
and understand complex connections.\n\n### **Knowledge Graphs: The Structured
|
121
|
+
Wisdom\u00a0Library**\n\nA knowledge graph, also referred to as a semantic
|
122
|
+
network, is a\nstructure that represents a network of real-world entities
|
123
|
+
such as\nobjects, events, situations, or concepts. It helps to illustrate
|
124
|
+
the\nconstantly changing representations of the world, connecting entities\n(such
|
125
|
+
as \"Marie Curie\") and relationships (such as \"won Nobel Prize\") to\nform
|
126
|
+
a complex network of information. This information is typically\nstored in
|
127
|
+
a graph database and visualized as a graph structure, thus the\nterm knowledge
|
128
|
+
\"graph\".\n\nKGs go beyond simply finding relevant facts and delve deeper
|
129
|
+
into\nunderstanding the relationships and insights hidden within using these\nprocesses:\n\n**Entity
|
130
|
+
Linking**: Imagine a vast network of information, like a big\npuzzle of dots.
|
131
|
+
Now imagine trying to connect specific names, places,\nand concepts to their
|
132
|
+
corresponding dots in the puzzle. That is what\nentity linking does with text
|
133
|
+
and knowledge graphs, connecting the\nspecific components of the text to the
|
134
|
+
corresponding nodes in the graph.\nThey help systems understand the exact
|
135
|
+
meaning of entities, and find\nrelevant information from the\u00a0graph.\n\nLibraries
|
136
|
+
like [**DGL-KeLP**](https://github.com/awslabs/dgl-ke)\nleverage GNNs to identify
|
137
|
+
and link named entities (like \"Marie Curie\")\nto their respective nodes
|
138
|
+
within the Knowledge Graphs, enabling RAG to\nretrieve information that is
|
139
|
+
directly relevant to the core subject of a\nsearch\u00a0query\n\n**Path Mining**:
|
140
|
+
Path mining is a process of uncovering hidden\nrelationships and patterns
|
141
|
+
that are not easily noticeable. It involves\nexploring complicated networks
|
142
|
+
of information and identifying and\ntracing connections between entities that
|
143
|
+
may seem unrelated. By doing\nso, path mining reveals surprising insights
|
144
|
+
and useful knowledge,\nimproving our understanding of the complex structures
|
145
|
+
within knowledge\ngraphs.\n\nTools like [**Neo4j**](https://neo4j.com/) and\n[**Stanza**](https://github.com/stanfordnlp/stanza)
|
146
|
+
allow traversing\npaths between entities, uncovering hidden relationships,
|
147
|
+
and generating\ninsightful responses based on this deeper understanding.\n\n**Reasoning
|
148
|
+
and Inference**: In the context of knowledge graphs,\nreasoning and inference
|
149
|
+
are not just limited to discovering facts; they\nare also concerned with utilizing
|
150
|
+
them effectively. This involves\nintegrating data, drawing meaningful connections,
|
151
|
+
and using logical\nreasoning to resolve issues, foresee future occurrences,
|
152
|
+
or even\nconstruct narratives leveraging the insights provided by the knowledge\ngraph.\n\nConsider
|
153
|
+
the scenario of trying to find an organization that works in\nspecific sectors
|
154
|
+
with the help of a knowledge graph. This analogy\neffectively highlights the
|
155
|
+
active role of reasoning and inference in\nknowledge graphs:\n\n1. Gathering
|
156
|
+
Facts: Knowledge graphs collect and organize information\n from various
|
157
|
+
sources, such as websites, databases, academic papers,\n and social media
|
158
|
+
platforms. These facts are represented as\n structured data, with entities
|
159
|
+
(e.g., organizations) and their\n attributes (e.g., sectors in which they
|
160
|
+
operate) forming nodes and\n edges in the graph. By combining data about
|
161
|
+
organizations and\n sectors, knowledge graphs enable the gathering of relevant
|
162
|
+
facts for\n analysis.\n2. Integrating information: By connecting an organization''s\n relationships
|
163
|
+
with specific sectors, such as partnerships,\n investments, or certifications,
|
164
|
+
knowledge graphs reveal the scope\n and relevance of their work within
|
165
|
+
those sectors. Links to related\n entities like employees, board members,
|
166
|
+
or projects can further\n contribute to understanding an organization''s
|
167
|
+
involvement in\n specific\u00a0sectors.\n3. Predicting and Creating: Knowledge
|
168
|
+
graphs can leverage machine\n learning and predictive models to infer missing
|
169
|
+
or hidden\n information. By analyzing the available facts and connections
|
170
|
+
within\n the graph, these models can predict an organization''s potential\n involvement
|
171
|
+
in sectors that have common attributes with their known\n areas of operation.
|
172
|
+
For example, if an organization has expertise in\n renewable energy, predictive
|
173
|
+
models could suggest their likely\n involvement in related sectors like
|
174
|
+
clean transportation or\n sustainable infrastructure. Additionally, knowledge
|
175
|
+
graphs\n facilitate the creation of new information and insights by combining\n existing
|
176
|
+
facts with external data sources. For instance, by\n integrating real-time
|
177
|
+
data on industry trends, market analysis, or\n news articles, knowledge
|
178
|
+
graphs enable the discovery of emerging\n sectors or upcoming organizations
|
179
|
+
that might align with the given\n parameters.\n\nA framework like [**Atomspace**](https://github.com/opencog/atomspace)\nfrom
|
180
|
+
[**OpenCog**](https://opencog.org/) empowers RAG to reason and\ninfer new
|
181
|
+
knowledge. By traversing paths and combining information from\ninterconnected
|
182
|
+
entities, the system can generate informed predictions or\nanswer hypothetical
|
183
|
+
questions.\n\n### Purpose\n\nThe combination of Retrieval-Augmented Generation
|
184
|
+
(RAG) and Knowledge\nGraphs (KG) is beneficial for several\u00a0reasons:\n\n1. **Enhanced
|
185
|
+
information retrieval**: Knowledge graphs provide\n structured and interconnected
|
186
|
+
information that can significantly\n improve the effectiveness of information
|
187
|
+
retrieval. By using KGs,\n RAG models can retrieve more accurate and relevant
|
188
|
+
information,\n leading to better generation and response\u00a0quality.\n2. **Reliable
|
189
|
+
and diverse information:** KGs are constructed from\n authoritative sources,
|
190
|
+
making them reliable and trustworthy sources\n of information. RAG models
|
191
|
+
can leverage this reliable information to\n generate more accurate responses.
|
192
|
+
Additionally, KGs help in\n diversifying the generated responses by providing
|
193
|
+
a broader pool of\n related facts and entities.\n3. **Context-aware understanding**:
|
194
|
+
KGs enable RAG models to understand\n and reason over the contextual information.
|
195
|
+
By leveraging the\n relationships and semantic connections encoded in KGs,
|
196
|
+
RAG models\n can better grasp the context of user queries or conversations,\n resulting
|
197
|
+
in more coherent and appropriate responses.\n4. **Handling complex queries**:
|
198
|
+
KGs allow RAG models to tackle complex\n queries by breaking them down
|
199
|
+
into smaller sub-queries, retrieving\n relevant pieces of information from
|
200
|
+
the KG, and then generating a\n response based on the retrieved knowledge.
|
201
|
+
This enables RAG models\n to handle a wide range of user queries effectively.\n5. **Explainability
|
202
|
+
and transparency**: KGs provide a transparent and\n interpretable representation
|
203
|
+
of knowledge. By integrating KG-based\n retrieval into RAG models, the
|
204
|
+
reasoning behind the generated\n responses becomes more explainable. Users
|
205
|
+
can have a clear\n understanding of the knowledge sources and connections
|
206
|
+
used to\n produce the response.\n6. **Scalability**: Knowledge graphs
|
207
|
+
act as large-scale repositories of\n information. RAG models can leverage
|
208
|
+
KGs to generate responses to\n various queries or conversations without
|
209
|
+
requiring additional\n supervised training data. This makes the RAG+KG
|
210
|
+
approach scalable to\n handle an extensive range of knowledge domains and
|
211
|
+
user\u00a0queries.\n\n### **Pipeline Possibilities: Orchestrating RAG and\u00a0KGs:**\n\nLet''s
|
212
|
+
explore some exciting pipeline options for harnessing the combined\npower
|
213
|
+
of RAG and Knowledge Graphs. There are two options in which either\nthe LLM
|
214
|
+
is prioritized or the Knowledge Graph is prioritized:\n\n**Option 1: LLM-Centric
|
215
|
+
Pipeline:**\n\nThe LLM-Centric pipeline is a RAG and Knowledge Graph combination
|
216
|
+
that\nempowers LLMs to craft well-informed responses. Here''s how it\u00a0works:\n\n1. Start
|
217
|
+
with the user''s question or statement\n2. The LLM (like GPT-3) generates
|
218
|
+
an initial draft response based on\n its internal knowledge. This draft
|
219
|
+
may lack specific factual details\n or nuances that a knowledge graph can\u00a0provide.\n3. RAG
|
220
|
+
kicks in, searching the text corpus or the Knowledge Graph for\n relevant
|
221
|
+
passages that enrich the draft. During the retrieval\n process, RAG retrieval
|
222
|
+
techniques are used to search not only text\n corpora but also knowledge
|
223
|
+
graphs to find relevant information. This\n means that RAG can directly
|
224
|
+
tap into the structured knowledge within\n the graph to retrieve facts,
|
225
|
+
relationships, and entities that align\n with the user''s query and the
|
226
|
+
LLM''s generated draft.\n4. The retrieved information is carefully fused
|
227
|
+
with the LLM''s output,\n creating a more factually accurate and insightful
|
228
|
+
response\n5. A final polishing step ensures the response is fluent, grammatically\n correct,
|
229
|
+
and ready to\u00a0show.\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*3pd9MOIflkbS07wI\"
|
230
|
+
/>\n<figcaption>RAG LLM-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
|
231
|
+
basic steps to perform this\u00a0are:\n\n1. **Pre-processing**: Clean and
|
232
|
+
tokenize user input to prepare for\n processing.\n2. **LLM Generation**:
|
233
|
+
Generate an initial draft response using an LLM\n like [**GPT-3**](https://openai.com/product)
|
234
|
+
or [**Jurassic-1\n Jumbo**](https://www.livescience.com/google-sentient-ai-lamda-lemoine).\n3. **Retrieval**:
|
235
|
+
Employ RAG techniques to retrieve relevant passages\n from a text corpus
|
236
|
+
or Knowledge Graphs.\n4. **Fusion**: Integrate retrieved information into
|
237
|
+
the LLM-generated\n draft, creating a more informed and factually-grounded
|
238
|
+
response.\n5. **Post-processing**: Refine the final response for fluency,\n grammatical
|
239
|
+
correctness, and overall coherence.\n\n**Option 2: Knowledge Graphs-Centric
|
240
|
+
Pipeline:**\n\nIn this approach, knowledge graphs take center stage. In essence,
|
241
|
+
this\npipeline prioritizes the structured knowledge within knowledge graphs,\nusing
|
242
|
+
RAG retrieval techniques to translate those insights into\ncompelling and
|
243
|
+
informative language. Here''s how it\u00a0unfolds:\n\n1. User input: The
|
244
|
+
process begins with the user''s question or statement\n2. Graph exploration:
|
245
|
+
The knowledge graph is meticulously explored to\n identify relevant entities,
|
246
|
+
relationships, and paths that align with\n the user''s input. This stage
|
247
|
+
involves techniques like entity\n linking, path mining, and reasoning to
|
248
|
+
uncover valuable information\n within the\u00a0graph\n3. Response planning:
|
249
|
+
The insights extracted from the graph are used to\n create a structured
|
250
|
+
response plan. This plan outlines the key\n points, facts, and logical
|
251
|
+
flow that the final response\n should\u00a0embody\n4. Language generation:
|
252
|
+
This is where RAG steps in. Its purpose is to\n create human-like text
|
253
|
+
that follows the response plan. It uses LLMs\n to produce well-written
|
254
|
+
sentences and paragraphs, combining the\n relevant information from the
|
255
|
+
knowledge graph while maintaining\n cohesiveness and readability.\n5. Post-processing:
|
256
|
+
The generated response undergoes a final refinement\n process to ensure
|
257
|
+
grammatical correctness, clarity, and\n overall\u00a0quality\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*mZ83esKBjbPmCq_C\"
|
258
|
+
/>\n<figcaption>RAG Knowledge Graph-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
|
259
|
+
basic steps\u00a0are:\n\n1. **Query Formulation**: Transform the user input
|
260
|
+
into a query\n suitable for Knowledge Graph''s exploration.\n2. **Knowledge
|
261
|
+
Graphs:** You can use either Neo4j or\n [NebulaGraph](https://www.nebula-graph.io/)
|
262
|
+
to implement a retrieval\n enhancement technique. This technique involves
|
263
|
+
utilizing a knowledge\n graph to illustrate the connections between entities
|
264
|
+
and\n relationships. Additionally, it incorporates a powerful language\n model
|
265
|
+
to improve the retrieval process.\n3. **Fact Selection**: Employ entity linking
|
266
|
+
and reasoning algorithms\n to select and prioritize the most relevant facts
|
267
|
+
based on the query\n and\u00a0context.\n4. **Natural Language Generation**
|
268
|
+
(**NLG**): Utilise specialized NLG\n models like\n [BART](https://research.facebook.com/publications/controllable-abstractive-summarization/)\n to
|
269
|
+
translate the extracted facts into a natural language response.\n5. **Refinement**:
|
270
|
+
Enhance the generated response for clarity and\n coherence.\n\n### **Unveiling
|
271
|
+
a Future of Intelligent Interaction**\n\nThe combination of RAG and Knowledge
|
272
|
+
Graphs goes beyond just being a\ntechnological fusion. It paves the way for
|
273
|
+
a future where the\ninteraction between humans and computers goes beyond simple
|
274
|
+
words and\nbecomes a more informed and refined form of communication. As these\ntechnologies
|
275
|
+
continue to develop, we can expect to witness a significant\ntransformation
|
276
|
+
in:\n\n- AI-powered assistants that answer your questions with the confidence\n of
|
277
|
+
a well-read friend, seamlessly combining relevant facts and\n insights gleaned
|
278
|
+
from Knowledge Graphs.\n- Next-generation search engines that go beyond keyword
|
279
|
+
matching,\n understanding the deeper meaning behind your queries and delivering\n comprehensive,
|
280
|
+
contextual results enriched with information from\n Knowledge Graphs.\n-
|
281
|
+
Creative writing tools that utilize RAG and Knowledge Graphs to\n generate
|
282
|
+
stories that are both factually accurate and full of\n unexpected plot twists
|
283
|
+
and character development, moving beyond\n clich\u00e9d patterns.\n\n###
|
284
|
+
**Conclusion**\n\nThe convergence of Retrieval Augmented Generation (RAG)
|
285
|
+
and Knowledge\nGraphs (KGs) brings about an exciting synergy in the world
|
286
|
+
of Natural\nLanguage Processing (NLP). RAG enhances the output of large language\nmodels
|
287
|
+
by carefully selecting relevant information from external sources\nand KGs,
|
288
|
+
allowing for well-informed and detailed responses. KGs, on the\nother hand,
|
289
|
+
provide a structured representation of real-world entities\nand their relationships,
|
290
|
+
enabling the exploration of hidden insights and\nthe discovery of complex
|
291
|
+
connections.\n\nThe integration of RAG and KGs opens up two pipeline possibilities.
|
292
|
+
The\nLLM-centric pipeline prioritizes the language model''s output, which
|
293
|
+
is\nthen enriched with information retrieved from KGs. The Knowledge\nGraphs-centric
|
294
|
+
pipeline, on the other hand, places KGs at the center,\nutilizing RAG techniques
|
295
|
+
to translate the structured insights into\ncompelling and informative language.\n\nWhile
|
296
|
+
integrating LLMs and a knowledge graph for content retrieval\nrequires careful
|
297
|
+
planning, the reward is significant. You can gain\naccess to hidden relationships
|
298
|
+
within information, ultimately leading to\nhigher-quality output information.\n\nTools
|
299
|
+
like **OpenAI**, **Langchain**, and **LlamaIndex** provide\nready-made pipelines
|
300
|
+
to integrate knowledge graphs (like **Neo4j**)\neasily. Meanwhile, open-source
|
301
|
+
LLMs like **Mistral**, **Llama**, and\n**Dolphin** are catching up to proprietary
|
302
|
+
models in performance, making\nthem attractive choices for building custom
|
303
|
+
architectures. This\nopen-source scenario allows for the exploration and examination
|
304
|
+
of\nvarious methods before fully committing to a particular technological\nframework.
|
305
|
+
So, it is crucial to evaluate your needs and choose the\napproach that best
|
306
|
+
fits your use\u00a0case.\n\n![](https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=fc0a6900f7eb){width=\"1\"\nheight=\"1\"}\n","doi":"https://doi.org/10.59350/jhrs4-22440","guid":"https://medium.com/p/fc0a6900f7eb","id":"05f01f68-ef81-47d7-a3c1-40aba91d358f","image":"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png","indexed_at":1706690571,"language":"en","published_at":1705557796,"reference":[],"relationships":[],"summary":"<strong>\n
|
307
|
+
Tools and Platform for Integration of Knowledge Graph with RAG pipelines.\n</strong>\nAuthors:
|
308
|
+
Aland Astudillo, Aishwarya Nambissan Many users of chatbots such as ChatGPT,
|
309
|
+
have encountered the problem of receiving inappropriate or incompatible responses.
|
310
|
+
There are several reasons why this might\u00a0happen. One reason is the lack
|
311
|
+
of appropriate training data, as chatbots are usually trained on large amounts
|
312
|
+
of text and code.","tags":["Artificial-intelligence","Machine-learning","Retrieval-augmented","Knowledge-graph"],"title":"Unveiling
|
313
|
+
the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs","updated_at":1705557796,"url":"https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb"}
|
314
|
+
|
315
|
+
'
|
316
|
+
recorded_at: Wed, 31 Jan 2024 19:50:01 GMT
|
317
|
+
recorded_with: VCR 6.2.0
|
@@ -90,7 +90,7 @@ describe Commonmeta::Metadata, vcr: true do
|
|
90
90
|
"affiliation" => [{ "name" => "Тверская государственная сельскохозяйственная академия" }], "familyName" => "Ганичева", "givenName" => "А.В.", "type" => "Person", "contributorRoles" => ["Author"],
|
91
91
|
)
|
92
92
|
expect(subject.titles.last).to eq("title" => "MODEL OF SYSTEM DYNAMICS OF PROCESS OF TRAINING",
|
93
|
-
"
|
93
|
+
"type" => "TranslatedTitle")
|
94
94
|
expect(subject.date).to eq("created" => "2019-02-12", "published" => "2019",
|
95
95
|
"registered" => "2019-02-12", "updated" => "2022-08-23")
|
96
96
|
expect(subject.publisher).to eq("name" => "МОДЕЛИРОВАНИЕ, ОПТИМИЗАЦИЯ И ИНФОРМАЦИОННЫЕ ТЕХНОЛОГИИ")
|
@@ -114,10 +114,14 @@ describe Commonmeta::Metadata, vcr: true do
|
|
114
114
|
expect(subject.contributors.first).to eq(
|
115
115
|
"name" => "Europäische Kommission", "contributorRoles" => ["Author"], "type" => "Organization",
|
116
116
|
)
|
117
|
-
expect(subject.titles).to eq([
|
118
|
-
|
119
|
-
|
120
|
-
|
117
|
+
expect(subject.titles).to eq([{ "language" => "de", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
|
118
|
+
{ "language" => "en", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
|
119
|
+
{ "language" => "de",
|
120
|
+
"title" => "The Common European Currency",
|
121
|
+
"type" => "Subtitle" },
|
122
|
+
{ "language" => "en",
|
123
|
+
"title" => "The Common European Currency",
|
124
|
+
"type" => "Subtitle" }])
|
121
125
|
expect(subject.subjects).to eq([{ "lang" => "en",
|
122
126
|
"subject" => "KAT12 International Institutions, Relations, Conditions",
|
123
127
|
"subjectScheme" => "ZA" },
|
@@ -163,14 +167,39 @@ describe Commonmeta::Metadata, vcr: true do
|
|
163
167
|
expect(subject.contributors.length).to eq(23)
|
164
168
|
expect(subject.contributors[0]).to eq("contributorRoles" => ["Author"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
|
165
169
|
expect(subject.contributors[2]).to eq("contributorRoles" => ["ContactPerson"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
|
166
|
-
expect(subject.date).to eq("created"=>"2022-10-27", "published"=>"2022", "registered"=>"2022-10-27", "updated"=>"2024-01-02")
|
170
|
+
expect(subject.date).to eq("created" => "2022-10-27", "published" => "2022", "registered" => "2022-10-27", "updated" => "2024-01-02")
|
167
171
|
expect(subject.publisher).to eq("name" => "Example Publisher")
|
168
|
-
expect(subject.
|
172
|
+
expect(subject.titles).to eq([{ "language" => "en", "title" => "Example Title" },
|
173
|
+
{ "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
|
174
|
+
{ "language" => "fr",
|
175
|
+
"title" => "Example TranslatedTitle",
|
176
|
+
"type" => "TranslatedTitle" },
|
177
|
+
{ "language" => "en",
|
178
|
+
"title" => "Example AlternativeTitle",
|
179
|
+
"type" => "AlternativeTitle" }])
|
180
|
+
expect(subject.descriptions).to eq([{ "description" => "Example Abstract",
|
181
|
+
"type" => "Abstract",
|
182
|
+
"language" => "en" },
|
183
|
+
{ "description" => "Example Methods",
|
184
|
+
"type" => "Methods",
|
185
|
+
"language" => "en" },
|
186
|
+
{ "description" => "Example SeriesInformation",
|
187
|
+
"type" => "Other",
|
188
|
+
"language" => "en" },
|
189
|
+
{ "description" => "Example TableOfContents",
|
190
|
+
"type" => "Other",
|
191
|
+
"language" => "en" },
|
192
|
+
{ "description" => "Example TechnicalInfo",
|
193
|
+
"type" => "TechnicalInfo",
|
194
|
+
"language" => "en" },
|
195
|
+
{ "description" => "Example Other", "type" => "Other", "language" => "en" }])
|
196
|
+
expect(subject.license).to eq("id" => "CC-PDDC", "url" => "https://creativecommons.org/licenses/publicdomain/")
|
169
197
|
end
|
170
198
|
|
171
199
|
it "instrument" do
|
172
200
|
input = "#{fixture_path}datacite-instrument.json"
|
173
201
|
subject = described_class.new(input: input)
|
202
|
+
puts subject.errors unless subject.valid?
|
174
203
|
expect(subject.valid?).to be true
|
175
204
|
expect(subject.id).to eq("https://doi.org/10.82433/08qf-ee96")
|
176
205
|
expect(subject.type).to eq("Instrument")
|
@@ -189,6 +189,31 @@ describe Commonmeta::Metadata, vcr: true do
|
|
189
189
|
expect(subject.references).to be_nil
|
190
190
|
end
|
191
191
|
|
192
|
+
it "medium post with institutional author" do
|
193
|
+
input = "https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f"
|
194
|
+
subject = described_class.new(input: input)
|
195
|
+
# expect(subject.valid?).to be true
|
196
|
+
expect(subject.id).to eq("https://doi.org/10.59350/jhrs4-22440")
|
197
|
+
expect(subject.url).to eq("https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb")
|
198
|
+
expect(subject.alternate_identifiers).to eq([{ "alternateIdentifier" => "05f01f68-ef81-47d7-a3c1-40aba91d358f", "alternateIdentifierType" => "UUID" }])
|
199
|
+
expect(subject.type).to eq("Article")
|
200
|
+
expect(subject.contributors.length).to eq(1)
|
201
|
+
expect(subject.contributors.first).to eq("contributorRoles"=>["Author"], "name"=>"Research Graph", "type"=>"Organization")
|
202
|
+
expect(subject.titles).to eq([{ "title" => "Unveiling the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs" }])
|
203
|
+
expect(subject.license).to eq("id" => "CC-BY-4.0",
|
204
|
+
"url" => "https://creativecommons.org/licenses/by/4.0/legalcode")
|
205
|
+
expect(subject.date).to eq("published"=>"2024-01-18", "updated"=>"2024-01-18")
|
206
|
+
expect(subject.descriptions.first["description"]).to start_with("<strong> Tools and Platform for Integration of Knowledge Graph with RAG pipelines.")
|
207
|
+
expect(subject.publisher).to eq("name" => "Research Graph")
|
208
|
+
expect(subject.subjects).to eq([{ "subject" => "Computer and information sciences" },
|
209
|
+
{ "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf",
|
210
|
+
"subject" => "FOS: Computer and information sciences",
|
211
|
+
"subjectScheme" => "Fields of Science and Technology (FOS)" }])
|
212
|
+
expect(subject.language).to eq("en")
|
213
|
+
expect(subject.container).to eq("identifier" => "https://medium.com/@researchgraph", "identifierType" => "URL", "title" => "Research Graph", "type" => "Periodical")
|
214
|
+
expect(subject.references).to be_nil
|
215
|
+
end
|
216
|
+
|
192
217
|
it "syldavia gazette post with references" do
|
193
218
|
input = "https://api.rogue-scholar.org/posts/0022b9ef-525a-4a79-81ad-13411697f58a"
|
194
219
|
subject = described_class.new(input: input)
|
@@ -33,11 +33,38 @@ describe Commonmeta::Metadata, vcr: true do
|
|
33
33
|
"volume" => "426",
|
34
34
|
"firstPage" => "181",
|
35
35
|
"containerTitle" => "Nature")
|
36
|
-
expect(json["date"]).to eq("published"=>"2014-02-11", "updated"=>"2022-03-26")
|
36
|
+
expect(json["date"]).to eq("published" => "2014-02-11", "updated" => "2022-03-26")
|
37
37
|
expect(json["descriptions"].first["description"]).to start_with("Among various advantages,")
|
38
|
-
expect(json["license"]).to eq("id"=>"CC-BY-3.0", "url"=>"https://creativecommons.org/licenses/by/3.0/legalcode")
|
38
|
+
expect(json["license"]).to eq("id" => "CC-BY-3.0", "url" => "https://creativecommons.org/licenses/by/3.0/legalcode")
|
39
39
|
expect(json["provider"]).to eq("Crossref")
|
40
|
-
expect(json["files"].first).to eq("mimeType"=>"application/pdf", "url"=>"https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
|
40
|
+
expect(json["files"].first).to eq("mimeType" => "application/pdf", "url" => "https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
|
41
|
+
end
|
42
|
+
|
43
|
+
it "dataset schema v4.5" do
|
44
|
+
input = "#{fixture_path}datacite-dataset_v4.5.json"
|
45
|
+
subject = described_class.new(input: input)
|
46
|
+
expect(subject.id).to eq("https://doi.org/10.82433/b09z-4k37")
|
47
|
+
json = JSON.parse(subject.commonmeta)
|
48
|
+
expect(json["id"]).to eq("https://doi.org/10.82433/b09z-4k37")
|
49
|
+
expect(json["type"]).to eq("Dataset")
|
50
|
+
expect(json["titles"]).to eq([{ "language" => "en", "title" => "Example Title" },
|
51
|
+
{ "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
|
52
|
+
{ "language" => "fr",
|
53
|
+
"title" => "Example TranslatedTitle",
|
54
|
+
"type" => "TranslatedTitle" },
|
55
|
+
{ "language" => "en",
|
56
|
+
"title" => "Example AlternativeTitle",
|
57
|
+
"type" => "AlternativeTitle" }])
|
58
|
+
expect(json["descriptions"]).to eq([{ "description" => "Example Abstract", "language" => "en", "type" => "Abstract" },
|
59
|
+
{ "description" => "Example Methods", "language" => "en", "type" => "Methods" },
|
60
|
+
{ "description" => "Example SeriesInformation",
|
61
|
+
"language" => "en",
|
62
|
+
"type" => "Other" },
|
63
|
+
{ "description" => "Example TableOfContents", "language" => "en", "type" => "Other" },
|
64
|
+
{ "description" => "Example TechnicalInfo",
|
65
|
+
"language" => "en",
|
66
|
+
"type" => "TechnicalInfo" },
|
67
|
+
{ "description" => "Example Other", "language" => "en", "type" => "Other" }])
|
41
68
|
end
|
42
69
|
end
|
43
70
|
end
|
@@ -7,6 +7,7 @@ describe Commonmeta::Metadata, vcr: true do
|
|
7
7
|
it 'Dataset' do
|
8
8
|
input = 'https://doi.org/10.5061/DRYAD.8515'
|
9
9
|
subject = described_class.new(input: input, from: 'datacite')
|
10
|
+
puts subject.errors unless subject.valid?
|
10
11
|
expect(subject.valid?).to be true
|
11
12
|
json = JSON.parse(subject.csl)
|
12
13
|
expect(json['type']).to eq('dataset')
|
@@ -37,6 +37,7 @@ describe Commonmeta::Metadata, vcr: true do
|
|
37
37
|
it 'text' do
|
38
38
|
input = 'https://doi.org/10.3204/desy-2014-01645'
|
39
39
|
subject = described_class.new(input: input, from: 'datacite')
|
40
|
+
puts subject.errors unless subject.valid?
|
40
41
|
expect(subject.valid?).to be true
|
41
42
|
csv = subject.csv.parse_csv
|
42
43
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: commonmeta-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.12.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Martin Fenner
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-02-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -694,7 +694,7 @@ files:
|
|
694
694
|
- lib/commonmeta/xml_converter.rb
|
695
695
|
- resources/2008/09/xsd.xsl
|
696
696
|
- resources/cff.json
|
697
|
-
- resources/commonmeta_v0.10.
|
697
|
+
- resources/commonmeta_v0.10.7.json
|
698
698
|
- resources/crossref/AccessIndicators.xsd
|
699
699
|
- resources/crossref/JATS-journalpublishing1-3d2-mathml3-elements.xsd
|
700
700
|
- resources/crossref/JATS-journalpublishing1-3d2-mathml3.xsd
|
@@ -921,6 +921,7 @@ files:
|
|
921
921
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_without_doi.yml
|
922
922
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/jekyll_post.yml
|
923
923
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/jekyll_post_with_anonymous_author.yml
|
924
|
+
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/medium_post_with_institutional_author.yml
|
924
925
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/substack_post_with_broken_reference.yml
|
925
926
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/syldavia_gazette_post_with_references.yml
|
926
927
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/upstream_post_with_references.yml
|