commonmeta-ruby 3.11.0 → 3.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -4
- data/lib/commonmeta/author_utils.rb +9 -6
- data/lib/commonmeta/readers/datacite_reader.rb +117 -111
- data/lib/commonmeta/schema_utils.rb +1 -1
- data/lib/commonmeta/version.rb +1 -1
- data/lib/commonmeta/writers/commonmeta_writer.rb +1 -1
- data/resources/{commonmeta_v0.10.6.json → commonmeta_v0.10.7.json} +11 -3
- data/spec/author_utils_spec.rb +10 -0
- data/spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/medium_post_with_institutional_author.yml +317 -0
- data/spec/readers/datacite_reader_spec.rb +36 -7
- data/spec/readers/json_feed_reader_spec.rb +25 -0
- data/spec/writers/commonmeta_writer_spec.rb +30 -3
- data/spec/writers/csl_writer_spec.rb +1 -0
- data/spec/writers/csv_writer_spec.rb +1 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5acc4ac253ffc536724d14bf4d6cc58710978da3587c21035d946889b454a959
|
4
|
+
data.tar.gz: c1eac95196a7e2f01b52c5f6f94e0055299554e59824dfc88bda9de661c34193
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 95b8264ab1e837f26971d12df81ec4b3fc156d21d63387b0053646aacf80f8be6b03d77837759d2876ecd35c05400114a8c559df45219c74160422e58b73d868
|
7
|
+
data.tar.gz: 1cca6f5bfa1bd30d966744931a88054cea207f4ed50f1c94b5a428e656be4f7216b71055f9e66d2ac03cb60b2140e3610f1a8c60c5bd26d4644daa0d4ece0b13
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
commonmeta-ruby (3.11.0)
|
4
|
+
commonmeta-ruby (3.12.0)
|
5
5
|
activesupport (>= 4.2.5, < 8.0)
|
6
6
|
addressable (~> 2.8.1, < 2.8.2)
|
7
7
|
base32-url (>= 0.7.0, < 1)
|
@@ -58,7 +58,8 @@ GEM
|
|
58
58
|
rubocop (~> 1.0)
|
59
59
|
concurrent-ruby (1.2.3)
|
60
60
|
connection_pool (2.4.1)
|
61
|
-
crack (0.4.
|
61
|
+
crack (0.4.6)
|
62
|
+
bigdecimal
|
62
63
|
rexml
|
63
64
|
crass (1.0.6)
|
64
65
|
csl (2.0.0)
|
@@ -66,7 +67,7 @@ GEM
|
|
66
67
|
rexml
|
67
68
|
csl-styles (2.0.1)
|
68
69
|
csl (~> 2.0)
|
69
|
-
diff-lcs (1.5.
|
70
|
+
diff-lcs (1.5.1)
|
70
71
|
docile (1.4.0)
|
71
72
|
domain_name (0.6.20240107)
|
72
73
|
drb (2.2.0)
|
@@ -154,7 +155,7 @@ GEM
|
|
154
155
|
iniparser (>= 0.1.0)
|
155
156
|
public_suffix (4.0.7)
|
156
157
|
racc (1.7.3)
|
157
|
-
rack (3.0.
|
158
|
+
rack (3.0.9)
|
158
159
|
rack-test (2.1.0)
|
159
160
|
rack (>= 1.3)
|
160
161
|
rainbow (3.1.1)
|
data/lib/commonmeta/author_utils.rb
CHANGED
@@ -25,8 +25,8 @@ module Commonmeta
|
|
25
25
|
"Researcher" => "Other",
|
26
26
|
"Sponsor" => "Other",
|
27
27
|
"Supervisor" => "Supervision",
|
28
|
-
"WorkPackageLeader" => "Other"
|
29
|
-
}
|
28
|
+
"WorkPackageLeader" => "Other",
|
29
|
+
}
|
30
30
|
|
31
31
|
def get_one_author(author)
|
32
32
|
# basic sanity checks
|
@@ -55,20 +55,20 @@ module Commonmeta
|
|
55
55
|
parse_attributes(author.fetch("identifier", nil), first: true) ||
|
56
56
|
parse_attributes(author.fetch("sameAs", nil), first: true)
|
57
57
|
id = normalize_orcid(id) || normalize_ror(id) if id.present?
|
58
|
-
|
58
|
+
|
59
59
|
# DataCite metadata
|
60
60
|
if id.nil? && author["nameIdentifiers"].present?
|
61
61
|
id = Array.wrap(author.dig("nameIdentifiers")).find do |ni|
|
62
62
|
normalize_name_identifier(ni).present?
|
63
63
|
end
|
64
64
|
id = normalize_name_identifier(id) if id.present?
|
65
|
-
|
65
|
+
# Crossref metadata
|
66
66
|
elsif id.nil? && author["ORCID"].present?
|
67
67
|
id = author.fetch("ORCID")
|
68
68
|
id = normalize_orcid(id)
|
69
|
-
|
69
|
+
# JSON Feed metadata
|
70
70
|
elsif id.nil? && author["url"].present?
|
71
|
-
id = author.fetch("url")
|
71
|
+
id = author.fetch("url")
|
72
72
|
end
|
73
73
|
|
74
74
|
# parse author type, i.e. "Person", "Organization" or not specified
|
@@ -168,6 +168,9 @@ module Commonmeta
|
|
168
168
|
# check if a name has only one word, e.g. "FamousOrganization", not including commas
|
169
169
|
return false if name.to_s.split(" ").size == 1 && name.to_s.exclude?(",")
|
170
170
|
|
171
|
+
# check if name contains words known to be used in organization names
|
172
|
+
return false if %w[University College Institute School Center Department Laboratory Library Museum Foundation Society Association Company Corporation Collaboration Consortium Incorporated Inc. Institut Research Science].any? { |word| name.to_s.include?(word) }
|
173
|
+
|
171
174
|
# check for suffixes, e.g. "John Smith, MD"
|
172
175
|
return true if name && %w[MD PhD].include?(name.split(", ").last)
|
173
176
|
|
data/lib/commonmeta/readers/datacite_reader.rb
CHANGED
@@ -4,29 +4,29 @@ module Commonmeta
|
|
4
4
|
module Readers
|
5
5
|
module DataciteReader
|
6
6
|
def get_datacite(id: nil, **options)
|
7
|
-
return {
|
7
|
+
return { "string" => nil, "state" => "not_found" } unless id.present?
|
8
8
|
|
9
9
|
api_url = datacite_api_url(id, options)
|
10
10
|
response = HTTP.get(api_url)
|
11
|
-
return {
|
11
|
+
return { "string" => nil, "state" => "not_found" } unless response.status.success?
|
12
12
|
|
13
13
|
body = JSON.parse(response.body)
|
14
|
-
client = Array.wrap(body.fetch(
|
15
|
-
m[
|
14
|
+
client = Array.wrap(body.fetch("included", nil)).find do |m|
|
15
|
+
m["type"] == "clients"
|
16
16
|
end
|
17
|
-
client_id = client.to_h.fetch(
|
18
|
-
provider_id = Array.wrap(client.to_h.fetch(
|
19
|
-
m[
|
20
|
-
end.to_h.dig(
|
21
|
-
|
22
|
-
{
|
23
|
-
|
24
|
-
|
17
|
+
client_id = client.to_h.fetch("id", nil)
|
18
|
+
provider_id = Array.wrap(client.to_h.fetch("relationships", nil)).find do |m|
|
19
|
+
m["provider"].present?
|
20
|
+
end.to_h.dig("provider", "data", "id")
|
21
|
+
|
22
|
+
{ "string" => response.body.to_s,
|
23
|
+
"provider_id" => provider_id,
|
24
|
+
"client_id" => client_id }
|
25
25
|
end
|
26
26
|
|
27
27
|
def read_datacite(string: nil, **_options)
|
28
28
|
errors = jsonlint(string)
|
29
|
-
return {
|
29
|
+
return { "errors" => errors } if errors.present?
|
30
30
|
|
31
31
|
read_options = ActiveSupport::HashWithIndifferentAccess.new(_options.except(:doi, :id, :url,
|
32
32
|
:sandbox, :validate, :ra))
|
@@ -34,140 +34,146 @@ module Commonmeta
|
|
34
34
|
meta = string.present? ? JSON.parse(string) : {}
|
35
35
|
|
36
36
|
# optionally strip out the message wrapper from API
|
37
|
-
meta = meta.dig(
|
37
|
+
meta = meta.dig("data", "attributes") if meta.dig("data").present?
|
38
38
|
|
39
39
|
meta.transform_keys!(&:underscore)
|
40
40
|
|
41
|
-
id = normalize_doi(meta.fetch(
|
41
|
+
id = normalize_doi(meta.fetch("doi", nil))
|
42
42
|
|
43
|
-
resource_type_general = meta.dig(
|
44
|
-
resource_type = meta.dig(
|
43
|
+
resource_type_general = meta.dig("types", "resourceTypeGeneral")
|
44
|
+
resource_type = meta.dig("types", "resourceType")
|
45
45
|
# if resource_type is one of the new resource_type_general types introduced in schema 4.3, use it
|
46
46
|
type = Commonmeta::Utils::DC_TO_CM_TRANSLATIONS.fetch(resource_type, nil) ||
|
47
|
-
Commonmeta::Utils::DC_TO_CM_TRANSLATIONS.fetch(resource_type_general,
|
47
|
+
Commonmeta::Utils::DC_TO_CM_TRANSLATIONS.fetch(resource_type_general, "Other")
|
48
48
|
|
49
|
-
alternate_identifiers = Array.wrap(meta.fetch(
|
49
|
+
alternate_identifiers = Array.wrap(meta.fetch("alternate_identifiers", nil)).map do |i|
|
50
50
|
i.transform_keys! { |k| k.camelize(:lower) }
|
51
51
|
end
|
52
|
-
url = meta.fetch(
|
53
|
-
titles = Array.wrap(meta.fetch(
|
54
|
-
title.
|
52
|
+
url = meta.fetch("url", nil)
|
53
|
+
titles = Array.wrap(meta.fetch("titles", nil)).map do |title|
|
54
|
+
{ "title" => title.fetch("title", nil),
|
55
|
+
"type" => title.fetch("titleType", nil),
|
56
|
+
"language" => title.fetch("lang", nil) }.compact
|
55
57
|
end
|
56
|
-
contributors = get_authors(from_datacite(meta.fetch(
|
57
|
-
contributors += get_authors(from_datacite(meta.fetch(
|
58
|
-
if meta.fetch(
|
59
|
-
publisher = {
|
60
|
-
elsif meta.fetch(
|
61
|
-
publisher = {
|
58
|
+
contributors = get_authors(from_datacite(meta.fetch("creators", nil)))
|
59
|
+
contributors += get_authors(from_datacite(meta.fetch("contributors", nil)))
|
60
|
+
if meta.fetch("publisher", nil).is_a?(Hash)
|
61
|
+
publisher = { "name" => meta.fetch("publisher", nil).fetch("name", nil) }
|
62
|
+
elsif meta.fetch("publisher", nil).is_a?(String)
|
63
|
+
publisher = { "name" => meta.fetch("publisher", nil) }
|
62
64
|
else
|
63
65
|
publisher = nil
|
64
66
|
end
|
65
67
|
|
66
|
-
container = meta.fetch(
|
67
|
-
funding_references = meta.fetch(
|
68
|
+
container = meta.fetch("container", nil)
|
69
|
+
funding_references = meta.fetch("funding_references", nil)
|
68
70
|
|
69
71
|
date = {}
|
70
|
-
date[
|
71
|
-
get_iso8601_date(meta.dig(
|
72
|
-
date[
|
73
|
-
get_iso8601_date(meta.dig(
|
74
|
-
|
75
|
-
date[
|
76
|
-
date[
|
77
|
-
get_iso8601_date(meta.dig(
|
78
|
-
|
79
|
-
descriptions = Array.wrap(meta.fetch(
|
80
|
-
description.
|
72
|
+
date["created"] =
|
73
|
+
get_iso8601_date(meta.dig("created")) || get_date(meta.dig("dates"), "Created")
|
74
|
+
date["published"] =
|
75
|
+
get_iso8601_date(meta.dig("published")) || get_date(meta.dig("dates"),
|
76
|
+
"Issued") || get_iso8601_date(meta.dig("publication_year"))
|
77
|
+
date["registered"] = get_iso8601_date(meta.dig("registered"))
|
78
|
+
date["updated"] =
|
79
|
+
get_iso8601_date(meta.dig("updated")) || get_date(meta.dig("dates"), "Updated")
|
80
|
+
|
81
|
+
descriptions = Array.wrap(meta.fetch("descriptions", nil)).map do |description|
|
82
|
+
description_type = description.fetch("descriptionType", nil)
|
83
|
+
description_type = "Other" unless %w[Abstract Methods TechnicalInfo].include?(description_type)
|
84
|
+
{ "description" => description.fetch("description", nil),
|
85
|
+
"type" => description_type,
|
86
|
+
"language" => description.fetch("lang", nil) }.compact
|
81
87
|
end
|
82
|
-
license = Array.wrap(meta.fetch(
|
83
|
-
r[
|
88
|
+
license = Array.wrap(meta.fetch("rights_list", nil)).find do |r|
|
89
|
+
r["rightsUri"].present?
|
84
90
|
end
|
85
|
-
license = hsh_to_spdx(
|
86
|
-
version = meta.fetch(
|
87
|
-
subjects = meta.fetch(
|
88
|
-
language = meta.fetch(
|
89
|
-
geo_locations = meta.fetch(
|
90
|
-
references = (Array.wrap(meta.fetch(
|
91
|
-
nil)) + Array.wrap(meta.fetch(
|
91
|
+
license = hsh_to_spdx("rightsURI" => license["rightsUri"]) if license.present?
|
92
|
+
version = meta.fetch("version", nil)
|
93
|
+
subjects = meta.fetch("subjects", nil)
|
94
|
+
language = meta.fetch("language", nil)
|
95
|
+
geo_locations = meta.fetch("geo_locations", nil)
|
96
|
+
references = (Array.wrap(meta.fetch("related_identifiers",
|
97
|
+
nil)) + Array.wrap(meta.fetch("related_items",
|
92
98
|
nil))).select do |r|
|
93
|
-
|
94
|
-
|
99
|
+
%w[References Cites IsSupplementedBy].include?(r["relationType"])
|
100
|
+
end.map do |reference|
|
95
101
|
get_datacite_reference(reference)
|
96
102
|
end
|
97
|
-
files = Array.wrap(meta.fetch("content_url", nil)).map { |file| { "url" => file } }
|
98
|
-
formats = meta.fetch(
|
99
|
-
sizes = meta.fetch(
|
100
|
-
schema_version = meta.fetch(
|
101
|
-
state = id.present? || read_options.present? ?
|
102
|
-
|
103
|
-
{
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
103
|
+
files = Array.wrap(meta.fetch("content_url", nil)).map { |file| { "url" => file } }
|
104
|
+
formats = meta.fetch("formats", nil)
|
105
|
+
sizes = meta.fetch("sizes", nil)
|
106
|
+
schema_version = meta.fetch("schema_version", nil) || "http://datacite.org/schema/kernel-4"
|
107
|
+
state = id.present? || read_options.present? ? "findable" : "not_found"
|
108
|
+
|
109
|
+
{ "id" => id,
|
110
|
+
"type" => type,
|
111
|
+
"additional_type" => resource_type == type ? nil : resource_type,
|
112
|
+
"url" => url,
|
113
|
+
"titles" => titles,
|
114
|
+
"contributors" => contributors,
|
115
|
+
"container" => container,
|
116
|
+
"publisher" => publisher,
|
117
|
+
"provider" => "DataCite",
|
118
|
+
"alternate_identifiers" => alternate_identifiers.presence,
|
119
|
+
"references" => references,
|
120
|
+
"funding_references" => funding_references,
|
121
|
+
"files" => files.presence,
|
122
|
+
"date" => date.compact,
|
123
|
+
"descriptions" => descriptions,
|
124
|
+
"license" => license,
|
125
|
+
"version" => version,
|
126
|
+
"subjects" => subjects,
|
127
|
+
"language" => language,
|
128
|
+
"geo_locations" => geo_locations,
|
129
|
+
"formats" => formats,
|
130
|
+
"sizes" => sizes,
|
131
|
+
"state" => state }.compact # .merge(read_options)
|
126
132
|
end
|
127
133
|
|
128
134
|
def format_contributor(contributor)
|
129
|
-
type = contributor.fetch(
|
130
|
-
|
131
|
-
{
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
135
|
+
type = contributor.fetch("nameType", nil)
|
136
|
+
|
137
|
+
{ "name" => type == "Person" ? nil : contributor.fetch("name", nil),
|
138
|
+
"type" => type,
|
139
|
+
"givenName" => contributor.fetch("givenName", nil),
|
140
|
+
"familyName" => contributor.fetch("familyName", nil),
|
141
|
+
"nameIdentifiers" => contributor.fetch("nameIdentifiers", nil).presence,
|
142
|
+
"affiliations" => contributor.fetch("affiliations", nil).presence,
|
143
|
+
"contributorType" => contributor.fetch("contributorType", nil) }.compact
|
138
144
|
end
|
139
145
|
|
140
146
|
def get_datacite_reference(reference)
|
141
147
|
return nil unless reference.present? || !reference.is_a?(Hash)
|
142
148
|
|
143
|
-
key = reference[
|
149
|
+
key = reference["relatedIdentifier"]
|
144
150
|
doi = nil
|
145
151
|
url = nil
|
146
152
|
|
147
|
-
case reference[
|
148
|
-
when
|
149
|
-
doi = normalize_doi(reference[
|
150
|
-
when
|
151
|
-
url = reference[
|
153
|
+
case reference["relatedIdentifierType"]
|
154
|
+
when "DOI"
|
155
|
+
doi = normalize_doi(reference["relatedIdentifier"])
|
156
|
+
when "URL"
|
157
|
+
url = reference["relatedIdentifier"]
|
152
158
|
else
|
153
|
-
url = reference[
|
159
|
+
url = reference["relatedIdentifier"]
|
154
160
|
end
|
155
161
|
|
156
162
|
{
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
163
|
+
"key" => key,
|
164
|
+
"doi" => doi,
|
165
|
+
"url" => url,
|
166
|
+
"contributor" => reference.dig("author"),
|
167
|
+
"title" => reference.dig("article-title"),
|
168
|
+
"publisher" => reference.dig("publisher"),
|
169
|
+
"publicationYear" => reference.dig("year"),
|
170
|
+
"volume" => reference.dig("volume"),
|
171
|
+
"issue" => reference.dig("issue"),
|
172
|
+
"firstPage" => reference.dig("first-page"),
|
173
|
+
"lastPage" => reference.dig("last-page"),
|
174
|
+
"containerTitle" => reference.dig("journal-title"),
|
175
|
+
"edition" => nil,
|
176
|
+
"unstructured" => doi.nil? ? reference.dig("unstructured") : nil,
|
171
177
|
}.compact
|
172
178
|
end
|
173
179
|
end
|
data/lib/commonmeta/schema_utils.rb
CHANGED
@@ -5,7 +5,7 @@ require "pathname"
|
|
5
5
|
|
6
6
|
module Commonmeta
|
7
7
|
module SchemaUtils
|
8
|
-
COMMONMETA = File.read(File.expand_path("../../resources/commonmeta_v0.10.6.json",
|
8
|
+
COMMONMETA = File.read(File.expand_path("../../resources/commonmeta_v0.10.7.json",
|
9
9
|
__dir__))
|
10
10
|
|
11
11
|
def json_schema_errors
|
data/lib/commonmeta/version.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
{
|
2
2
|
"$schema": "http://json-schema.org/draft-07/schema#",
|
3
|
-
"$id": "https://commonmeta.org/commonmeta_v0.10.6.json",
|
4
|
-
"title": "Commonmeta v0.10.6",
|
3
|
+
"$id": "https://commonmeta.org/commonmeta_v0.10.7.json",
|
4
|
+
"title": "Commonmeta v0.10.7",
|
5
5
|
"description": "JSON representation of the Commonmeta schema.",
|
6
6
|
"additionalProperties": false,
|
7
7
|
"definitions": {
|
@@ -253,6 +253,10 @@
|
|
253
253
|
"description": "The type of the title.",
|
254
254
|
"type": "string",
|
255
255
|
"enum": ["AlternativeTitle", "Subtitle", "TranslatedTitle"]
|
256
|
+
},
|
257
|
+
"language": {
|
258
|
+
"description": "The language of the title. Use one of the language codes from the IETF BCP 47 standard.",
|
259
|
+
"type": "string"
|
256
260
|
}
|
257
261
|
},
|
258
262
|
"required": ["title"]
|
@@ -424,7 +428,11 @@
|
|
424
428
|
"type": {
|
425
429
|
"description": "The type of the description.",
|
426
430
|
"type": "string",
|
427
|
-
"enum": ["Abstract", "
|
431
|
+
"enum": ["Abstract", "Summary", "Methods", "TechnicalInfo", "Other"]
|
432
|
+
},
|
433
|
+
"language": {
|
434
|
+
"description": "The language of the description. Use one of the language codes from the IETF BCP 47 standard.",
|
435
|
+
"type": "string"
|
428
436
|
}
|
429
437
|
},
|
430
438
|
"required": ["description"]
|
data/spec/author_utils_spec.rb
CHANGED
@@ -58,6 +58,16 @@ describe Commonmeta::Metadata, vcr: true do
|
|
58
58
|
author = { "name" => "Tejas S. Sathe, MD" }
|
59
59
|
expect(subject.is_personal_name?(name: author["name"])).to be true
|
60
60
|
end
|
61
|
+
|
62
|
+
it "name with organization string" do
|
63
|
+
author = { "name" => "University of California, Santa Barbara" }
|
64
|
+
expect(subject.is_personal_name?(name: author["name"])).to be false
|
65
|
+
end
|
66
|
+
|
67
|
+
it "name with another organization string" do
|
68
|
+
author = { "name" => "Research Graph" }
|
69
|
+
expect(subject.is_personal_name?(name: author["name"])).to be false
|
70
|
+
end
|
61
71
|
end
|
62
72
|
|
63
73
|
context "cleanup_author" do
|
@@ -0,0 +1,317 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f
|
6
|
+
body:
|
7
|
+
encoding: ASCII-8BIT
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Connection:
|
11
|
+
- close
|
12
|
+
Host:
|
13
|
+
- api.rogue-scholar.org
|
14
|
+
User-Agent:
|
15
|
+
- http.rb/5.1.1
|
16
|
+
response:
|
17
|
+
status:
|
18
|
+
code: 200
|
19
|
+
message: OK
|
20
|
+
headers:
|
21
|
+
Content-Type:
|
22
|
+
- application/json
|
23
|
+
Content-Length:
|
24
|
+
- '23886'
|
25
|
+
Ratelimit-Limit:
|
26
|
+
- '15'
|
27
|
+
Ratelimit-Remaining:
|
28
|
+
- '14'
|
29
|
+
Ratelimit-Reset:
|
30
|
+
- '3'
|
31
|
+
Date:
|
32
|
+
- Wed, 31 Jan 2024 19:50:01 GMT
|
33
|
+
Server:
|
34
|
+
- Fly/ba9e227a (2024-01-26)
|
35
|
+
Via:
|
36
|
+
- 1.1 fly.io
|
37
|
+
Fly-Request-Id:
|
38
|
+
- 01HNGH4EZV3XQF20H1PZ6X5N07-fra
|
39
|
+
body:
|
40
|
+
encoding: UTF-8
|
41
|
+
string: '{"abstract":null,"archive_url":null,"authors":[{"name":"Research Graph"}],"blog":{"api":false,"archive_prefix":null,"authors":null,"backlog":0,"canonical_url":null,"category":"computerAndInformationSciences","created_at":1706685423,"current_feed_url":null,"description":"Stories
|
42
|
+
by Research Graph on Medium","favicon":"https://cdn-images-1.medium.com/fit/c/150/150/1*laJi0jBkVoGhXid7gD_DmQ.png","feed_format":"application/rss+xml","feed_url":"https://medium.com/@researchgraph/feed","filter":null,"funding":null,"generator":"Medium","generator_raw":"Medium","home_page_url":"https://medium.com/@researchgraph","id":"30da2ca9-8258-4ab5-acca-3919d9a5d98d","indexed":true,"issn":null,"language":"en","license":"https://creativecommons.org/licenses/by/4.0/legalcode","mastodon":"","plan":"Starter","prefix":"10.59350","relative_url":null,"ror":null,"secure":true,"slug":"researchgraph","status":"active","title":"Research
|
43
|
+
Graph","updated_at":1706151454,"use_api":null,"use_mastodon":false,"user_id":"a7e16958-1175-437c-b839-d4b8a47ec811","version":"https://jsonfeed.org/version/1.1"},"blog_name":"Research
|
44
|
+
Graph","blog_slug":"researchgraph","content_text":"**Tools and Platform for
|
45
|
+
Integration of Knowledge Graph with RAG\npipelines.**\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png\"\nalt=\"Complex
|
46
|
+
network connected to books and showing information from magespace\" />\n<figcaption>Image
|
47
|
+
Created in <a\nhref=\"https://www.mage.space/\">https://www.mage.space/</a></figcaption>\n</figure>\n\nAuthors:
|
48
|
+
[Aland\nAstudillo](https://www.linkedin.com/in/aland-astudillo/), [Aishwarya\nNambissan](https://www.linkedin.com/in/aishwarya-nambissan-127229200/)\n\nMany
|
49
|
+
users of chatbots such as ChatGPT, have encountered the problem of\nreceiving
|
50
|
+
inappropriate or incompatible responses. There are several\nreasons why this
|
51
|
+
might\u00a0happen.\n\nOne reason is the lack of appropriate training data,
|
52
|
+
as chatbots are\nusually trained on large amounts of text and code. If the
|
53
|
+
data is\ninsufficient or of poor quality, the chatbot may misunderstand queries\nand
|
54
|
+
provide inaccurate responses. Another reason is that some chatbots\nare designed
|
55
|
+
for specific tasks or domains, which limits their ability\nto handle broader
|
56
|
+
queries or understand subtle nuances in conversation.\nAdditionally, chatbots
|
57
|
+
may struggle with natural language, which is\ncomplex and often ambiguous.
|
58
|
+
This can cause them to misunderstand a\nuser''s query and provide irrelevant
|
59
|
+
or off-topic responses. Finally,\nthere are technical limitations, such as
|
60
|
+
the chatbot''s inability to\nreason or make inferences.\n\nThis article explores
|
61
|
+
a potential solution by combining two influential\napproaches in the field
|
62
|
+
of Natural Language Processing\u200a---\u200aRetrieval\nAugmented Generation
|
63
|
+
(**RAG**) and Knowledge Graphs(**KGs**). We will\ndelve into the partnership
|
64
|
+
between these two entities, discuss the\nnotable technologies and software
|
65
|
+
used in their processes, and highlight\nvarious options for utilizing their
|
66
|
+
combined potential.\n\n### **RAG**\n\nRetrieval-Augmented Generation is the
|
67
|
+
process of optimizing the output\nof a large language model using a knowledge
|
68
|
+
base outside of its training\ndata sources before generating a response. It
|
69
|
+
takes an input and\nretrieves a set of relevant/supporting documents given
|
70
|
+
a source (e.g.,\nWikipedia). This can be thought of as a Large Language Model
|
71
|
+
(LLM) not\njust putting words together, but carefully selecting relevant\ninformation
|
72
|
+
from external sources and Knowledge Graphs to create\nwell-informed and detailed
|
73
|
+
responses.\n\n### RAG Retrieval Techniques\n\nThe following are some crucial
|
74
|
+
technologies that enable RAG''s impressive\nability to retrieve and incorporate
|
75
|
+
relevant information:\n\n**Vector Search**: It transforms text into numerical
|
76
|
+
vectors, capturing\ntheir meaning and nuances in a mathematical space, creating
|
77
|
+
a map of\nrelationships. Similar texts, like those discussing shared topics
|
78
|
+
or\nusing similar language, end up positioned close together in this space,\nallowing
|
79
|
+
vector search to quickly identify them as related. This allows\nlightning-fast
|
80
|
+
comparisons, finding similar texts based on meaning, not\njust keywords.\n\nAlgorithms
|
81
|
+
like [**Faiss**](https://github.com/facebookresearch/faiss)\nand [**Annoy**](https://github.com/spotify/annoy)
|
82
|
+
map text into dense\nvectors, enabling fast comparisons and retrieval of relevant
|
83
|
+
passages\nbased on semantic similarity.\n\n**Passage Ranking**: It is an internal
|
84
|
+
algorithm that scores candidate\ntext passages based on their relevance to
|
85
|
+
a query. It considers factors\nlike keyword frequency, keyword overlap, and
|
86
|
+
document structure to act\nlike a judge, sifting through information to select
|
87
|
+
the most fitting and\ninformative passages.\n\nKeyword overlap measures how
|
88
|
+
often the same keywords appear in **both**\nthe query and the candidate passage,
|
89
|
+
emphasizing shared vocabulary and\npotential relevance. It differs from keyword
|
90
|
+
frequency, which simply\ncounts how often individual keywords appear within
|
91
|
+
a passage, regardless\nof their presence in the\u00a0query.\n\nTechniques
|
92
|
+
like [**BM25**](https://github.com/getalp/wikIR) and\n[**TF-IDF**](https://github.com/marcocor/wikipedia-idf)
|
93
|
+
score candidate\npassages based on keyword overlap and frequency, ensuring
|
94
|
+
retrieved\ninformation truly fits the\u00a0context.\n\n**Graph Neural Networks**
|
95
|
+
(**GNNs**): They are neural networks designed\nto explore and learn from interconnected
|
96
|
+
data like maps, social\nnetworks, and other complex relationships. Unlike
|
97
|
+
traditional processing\nmethods that go through data in a linear fashion,
|
98
|
+
GNNs are capable of\nrecognizing hidden patterns and understanding relationships
|
99
|
+
like \"who\nknows who\" and \"what connects to what\" by \"hopping\" across
|
100
|
+
connections\nin\u00a0data.\n\nConsider a graph as a network of dots(nodes)
|
101
|
+
connected by lines (edges).\nEach dot represents some information, like a
|
102
|
+
person, object, or concept.\nThe lines tell you how these things relate to
|
103
|
+
each\u00a0other.\n\nGNNs work in rounds. In each\u00a0round:\n\n1. Message
|
104
|
+
Passing: Each node \"talks\" to its neighbors, sending\n messages along
|
105
|
+
the edges. These messages contain information about\n the node itself and
|
106
|
+
its features.\n2. Node Update: Each node receives messages from all its neighbors
|
107
|
+
and\n combines them with its own information. This update can involve\n calculations
|
108
|
+
and applying a special function.\n3. Output Calculation: Based on the updated
|
109
|
+
information, the network\n calculates an output for each node. This output
|
110
|
+
could be a\n prediction about the node''s category, its relationship to
|
111
|
+
another\n node, or some other relevant information.\n\nThis process repeats
|
112
|
+
for multiple rounds, allowing nodes to incorporate\ninformation from their
|
113
|
+
entire neighborhood, not just their direct\nneighbors. As the rounds progress,
|
114
|
+
the network learns to understand the\nrelationships between nodes and the
|
115
|
+
overall structure of the\u00a0graph.\n\nWhen dealing with Knowledge Graphs,
|
116
|
+
frameworks like\n[**PyTorch-Geometric**](https://readthedocs.org/projects/pytorch-geometric/)\nand
|
117
|
+
[**DeepMind''s\nGNN**](https://github.com/deepmind/deepmind-research/blob/master/learning_to_simulate/graph_network.py)\nlibrary
|
118
|
+
come into play. These frameworks allow GNNs to traverse\ninterconnected entities
|
119
|
+
and relationships within the graph, retrieve\nrelevant knowledge fragments,
|
120
|
+
and understand complex connections.\n\n### **Knowledge Graphs: The Structured
|
121
|
+
Wisdom\u00a0Library**\n\nA knowledge graph, also referred to as a semantic
|
122
|
+
network, is a\nstructure that represents a network of real-world entities
|
123
|
+
such as\nobjects, events, situations, or concepts. It helps to illustrate
|
124
|
+
the\nconstantly changing representations of the world, connecting entities\n(such
|
125
|
+
as \"Marie Curie\") and relationships (such as \"won Nobel Prize\") to\nform
|
126
|
+
a complex network of information. This information is typically\nstored in
|
127
|
+
a graph database and visualized as a graph structure, thus the\nterm knowledge
|
128
|
+
\"graph\".\n\nKGs go beyond simply finding relevant facts and delve deeper
|
129
|
+
into\nunderstanding the relationships and insights hidden within using these\nprocesses:\n\n**Entity
|
130
|
+
Linking**: Imagine a vast network of information, like a big\npuzzle of dots.
|
131
|
+
Now imagine trying to connect specific names, places,\nand concepts to their
|
132
|
+
corresponding dots in the puzzle. That is what\nentity linking does with text
|
133
|
+
and knowledge graphs, connecting the\nspecific components of the text to the
|
134
|
+
corresponding nodes in the graph.\nThey help systems understand the exact
|
135
|
+
meaning of entities, and find\nrelevant information from the\u00a0graph.\n\nLibraries
|
136
|
+
like [**DGL-KeLP**](https://github.com/awslabs/dgl-ke)\nleverage GNNs to identify
|
137
|
+
and link named entities (like \"Marie Curie\")\nto their respective nodes
|
138
|
+
within the Knowledge Graphs, enabling RAG to\nretrieve information that is
|
139
|
+
directly relevant to the core subject of a\nsearch\u00a0query\n\n**Path Mining**:
|
140
|
+
Path mining is a process of uncovering hidden\nrelationships and patterns
|
141
|
+
that are not easily noticeable. It involves\nexploring complicated networks
|
142
|
+
of information and identifying and\ntracing connections between entities that
|
143
|
+
may seem unrelated. By doing\nso, path mining reveals surprising insights
|
144
|
+
and useful knowledge,\nimproving our understanding of the complex structures
|
145
|
+
within knowledge\ngraphs.\n\nTools like [**Neo4j**](https://neo4j.com/) and\n[**Stanza**](https://github.com/stanfordnlp/stanza)
|
146
|
+
allow traversing\npaths between entities, uncovering hidden relationships,
|
147
|
+
and generating\ninsightful responses based on this deeper understanding.\n\n**Reasoning
|
148
|
+
and Inference**: In the context of knowledge graphs,\nreasoning and inference
|
149
|
+
are not just limited to discovering facts; they\nare also concerned with utilizing
|
150
|
+
them effectively. This involves\nintegrating data, drawing meaningful connections,
|
151
|
+
and using logical\nreasoning to resolve issues, foresee future occurrences,
|
152
|
+
or even\nconstruct narratives leveraging the insights provided by the knowledge\ngraph.\n\nConsider
|
153
|
+
the scenario of trying to find an organization that works in\nspecific sectors
|
154
|
+
with the help of a knowledge graph. This analogy\neffectively highlights the
|
155
|
+
active role of reasoning and inference in\nknowledge graphs:\n\n1. Gathering
|
156
|
+
Facts: Knowledge graphs collect and organize information\n from various
|
157
|
+
sources, such as websites, databases, academic papers,\n and social media
|
158
|
+
platforms. These facts are represented as\n structured data, with entities
|
159
|
+
(e.g., organizations) and their\n attributes (e.g., sectors in which they
|
160
|
+
operate) forming nodes and\n edges in the graph. By combining data about
|
161
|
+
organizations and\n sectors, knowledge graphs enable the gathering of relevant
|
162
|
+
facts for\n analysis.\n2. Integrating information: By connecting an organization''s\n relationships
|
163
|
+
with specific sectors, such as partnerships,\n investments, or certifications,
|
164
|
+
knowledge graphs reveal the scope\n and relevance of their work within
|
165
|
+
those sectors. Links to related\n entities like employees, board members,
|
166
|
+
or projects can further\n contribute to understanding an organization''s
|
167
|
+
involvement in\n specific\u00a0sectors.\n3. Predicting and Creating: Knowledge
|
168
|
+
graphs can leverage machine\n learning and predictive models to infer missing
|
169
|
+
or hidden\n information. By analyzing the available facts and connections
|
170
|
+
within\n the graph, these models can predict an organization''s potential\n involvement
|
171
|
+
in sectors that have common attributes with their known\n areas of operation.
|
172
|
+
For example, if an organization has expertise in\n renewable energy, predictive
|
173
|
+
models could suggest their likely\n involvement in related sectors like
|
174
|
+
clean transportation or\n sustainable infrastructure. Additionally, knowledge
|
175
|
+
graphs\n facilitate the creation of new information and insights by combining\n existing
|
176
|
+
facts with external data sources. For instance, by\n integrating real-time
|
177
|
+
data on industry trends, market analysis, or\n news articles, knowledge
|
178
|
+
graphs enable the discovery of emerging\n sectors or upcoming organizations
|
179
|
+
that might align with the given\n parameters.\n\nA framework like [**Atomspace**](https://github.com/opencog/atomspace)\nfrom
|
180
|
+
[**OpenCog**](https://opencog.org/) empowers RAG to reason and\ninfer new
|
181
|
+
knowledge. By traversing paths and combining information from\ninterconnected
|
182
|
+
entities, the system can generate informed predictions or\nanswer hypothetical
|
183
|
+
questions.\n\n### Purpose\n\nThe combination of Retrieval-Augmented Generation
|
184
|
+
(RAG) and Knowledge\nGraphs (KG) is beneficial for several\u00a0reasons:\n\n1. **Enhanced
|
185
|
+
information retrieval**: Knowledge graphs provide\n structured and interconnected
|
186
|
+
information that can significantly\n improve the effectiveness of information
|
187
|
+
retrieval. By using KGs,\n RAG models can retrieve more accurate and relevant
|
188
|
+
information,\n leading to better generation and response\u00a0quality.\n2. **Reliable
|
189
|
+
and diverse information:** KGs are constructed from\n authoritative sources,
|
190
|
+
making them reliable and trustworthy sources\n of information. RAG models
|
191
|
+
can leverage this reliable information to\n generate more accurate responses.
|
192
|
+
Additionally, KGs help in\n diversifying the generated responses by providing
|
193
|
+
a broader pool of\n related facts and entities.\n3. **Context-aware understanding**:
|
194
|
+
KGs enable RAG models to understand\n and reason over the contextual information.
|
195
|
+
By leveraging the\n relationships and semantic connections encoded in KGs,
|
196
|
+
RAG models\n can better grasp the context of user queries or conversations,\n resulting
|
197
|
+
in more coherent and appropriate responses.\n4. **Handling complex queries**:
|
198
|
+
KGs allow RAG models to tackle complex\n queries by breaking them down
|
199
|
+
into smaller sub-queries, retrieving\n relevant pieces of information from
|
200
|
+
the KG, and then generating a\n response based on the retrieved knowledge.
|
201
|
+
This enables RAG models\n to handle a wide range of user queries effectively.\n5. **Explainability
|
202
|
+
and transparency**: KGs provide a transparent and\n interpretable representation
|
203
|
+
of knowledge. By integrating KG-based\n retrieval into RAG models, the
|
204
|
+
reasoning behind the generated\n responses becomes more explainable. Users
|
205
|
+
can have a clear\n understanding of the knowledge sources and connections
|
206
|
+
used to\n produce the response.\n6. **Scalability**: Knowledge graphs
|
207
|
+
act as large-scale repositories of\n information. RAG models can leverage
|
208
|
+
KGs to generate responses to\n various queries or conversations without
|
209
|
+
requiring additional\n supervised training data. This makes the RAG+KG
|
210
|
+
approach scalable to\n handle an extensive range of knowledge domains and
|
211
|
+
user\u00a0queries.\n\n### **Pipeline Possibilities: Orchestrating RAG and\u00a0KGs:**\n\nLet''s
|
212
|
+
explore some exciting pipeline options for harnessing the combined\npower
|
213
|
+
of RAG and Knowledge Graphs. There are two options in which either\nthe LLM
|
214
|
+
is prioritized or the Knowledge Graph is prioritized:\n\n**Option 1: LLM-Centric
|
215
|
+
Pipeline:**\n\nThe LLM-Centric pipeline is a RAG and Knowledge Graph combination
|
216
|
+
that\nempowers LLMs to craft well-informed responses. Here''s how it\u00a0works:\n\n1. Start
|
217
|
+
with the user''s question or statement\n2. The LLM (like GPT-3) generates
|
218
|
+
an initial draft response based on\n its internal knowledge. This draft
|
219
|
+
may lack specific factual details\n or nuances that a knowledge graph can\u00a0provide.\n3. RAG
|
220
|
+
kicks in, searching the text corpus or the Knowledge Graph for\n relevant
|
221
|
+
passages that enrich the draft. During the retrieval\n process, RAG retrieval
|
222
|
+
techniques are used to search not only text\n corpora but also knowledge
|
223
|
+
graphs to find relevant information. This\n means that RAG can directly
|
224
|
+
tap into the structured knowledge within\n the graph to retrieve facts,
|
225
|
+
relationships, and entities that align\n with the user''s query and the
|
226
|
+
LLM''s generated draft.\n4. The retrieved information is carefully fused
|
227
|
+
with the LLM''s output,\n creating a more factually accurate and insightful
|
228
|
+
response\n5. A final polishing step ensures the response is fluent, grammatically\n correct,
|
229
|
+
and ready to\u00a0show.\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*3pd9MOIflkbS07wI\"
|
230
|
+
/>\n<figcaption>RAG LLM-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
|
231
|
+
basic steps to perform this\u00a0are:\n\n1. **Pre-processing**: Clean and
|
232
|
+
tokenize user input to prepare for\n processing.\n2. **LLM Generation**:
|
233
|
+
Generate an initial draft response using an LLM\n like [**GPT-3**](https://openai.com/product)
|
234
|
+
or [**Jurassic-1\n Jumbo**](https://www.livescience.com/google-sentient-ai-lamda-lemoine).\n3. **Retrieval**:
|
235
|
+
Employ RAG techniques to retrieve relevant passages\n from a text corpus
|
236
|
+
or Knowledge Graphs.\n4. **Fusion**: Integrate retrieved information into
|
237
|
+
the LLM-generated\n draft, creating a more informed and factually-grounded
|
238
|
+
response.\n5. **Post-processing**: Refine the final response for fluency,\n grammatical
|
239
|
+
correctness, and overall coherence.\n\n**Option 2: Knowledge Graphs-Centric
|
240
|
+
Pipeline:**\n\nIn this approach, knowledge graphs take center stage. In essence,
|
241
|
+
this\npipeline prioritizes the structured knowledge within knowledge graphs,\nusing
|
242
|
+
RAG retrieval techniques to translate those insights into\ncompelling and
|
243
|
+
informative language. Here''s how it\u00a0unfolds:\n\n1. User input: The
|
244
|
+
process begins with the user''s question or statement\n2. Graph exploration:
|
245
|
+
The knowledge graph is meticulously explored to\n identify relevant entities,
|
246
|
+
relationships, and paths that align with\n the user''s input. This stage
|
247
|
+
involves techniques like entity\n linking, path mining, and reasoning to
|
248
|
+
uncover valuable information\n within the\u00a0graph\n3. Response planning:
|
249
|
+
The insights extracted from the graph are used to\n create a structured
|
250
|
+
response plan. This plan outlines the key\n points, facts, and logical
|
251
|
+
flow that the final response\n should\u00a0embody\n4. Language generation:
|
252
|
+
This is where RAG steps in. Its purpose is to\n create human-like text
|
253
|
+
that follows the response plan. It uses LLMs\n to produce well-written
|
254
|
+
sentences and paragraphs, combining the\n relevant information from the
|
255
|
+
knowledge graph while maintaining\n cohesiveness and readability.\n5. Post-processing:
|
256
|
+
The generated response undergoes a final refinement\n process to ensure
|
257
|
+
grammatical correctness, clarity, and\n overall\u00a0quality\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*mZ83esKBjbPmCq_C\"
|
258
|
+
/>\n<figcaption>RAG Knowledge Graph-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
|
259
|
+
basic steps\u00a0are:\n\n1. **Query Formulation**: Transform the user input
|
260
|
+
into a query\n suitable for Knowledge Graph''s exploration.\n2. **Knowledge
|
261
|
+
Graphs:** You can use either Neo4j or\n [NebulaGraph](https://www.nebula-graph.io/)
|
262
|
+
to implement a retrieval\n enhancement technique. This technique involves
|
263
|
+
utilizing a knowledge\n graph to illustrate the connections between entities
|
264
|
+
and\n relationships. Additionally, it incorporates a powerful language\n model
|
265
|
+
to improve the retrieval process.\n3. **Fact Selection**: Employ entity linking
|
266
|
+
and reasoning algorithms\n to select and prioritize the most relevant facts
|
267
|
+
based on the query\n and\u00a0context.\n4. **Natural Language Generation**
|
268
|
+
(**NLG**): Utilise specialized NLG\n models like\n [BART](https://research.facebook.com/publications/controllable-abstractive-summarization/)\n to
|
269
|
+
translate the extracted facts into a natural language response.\n5. **Refinement**:
|
270
|
+
Enhance the generated response for clarity and\n coherence.\n\n### **Unveiling
|
271
|
+
a Future of Intelligent Interaction**\n\nThe combination of RAG and Knowledge
|
272
|
+
Graphs goes beyond just being a\ntechnological fusion. It paves the way for
|
273
|
+
a future where the\ninteraction between humans and computers goes beyond simple
|
274
|
+
words and\nbecomes a more informed and refined form of communication. As these\ntechnologies
|
275
|
+
continue to develop, we can expect to witness a significant\ntransformation
|
276
|
+
in:\n\n- AI-powered assistants that answer your questions with the confidence\n of
|
277
|
+
a well-read friend, seamlessly combining relevant facts and\n insights gleaned
|
278
|
+
from Knowledge Graphs.\n- Next-generation search engines that go beyond keyword
|
279
|
+
matching,\n understanding the deeper meaning behind your queries and delivering\n comprehensive,
|
280
|
+
contextual results enriched with information from\n Knowledge Graphs.\n-
|
281
|
+
Creative writing tools that utilize RAG and Knowledge Graphs to\n generate
|
282
|
+
stories that are both factually accurate and full of\n unexpected plot twists
|
283
|
+
and character development, moving beyond\n clich\u00e9d patterns.\n\n###
|
284
|
+
**Conclusion**\n\nThe convergence of Retrieval Augmented Generation (RAG)
|
285
|
+
and Knowledge\nGraphs (KGs) brings about an exciting synergy in the world
|
286
|
+
of Natural\nLanguage Processing (NLP). RAG enhances the output of large language\nmodels
|
287
|
+
by carefully selecting relevant information from external sources\nand KGs,
|
288
|
+
allowing for well-informed and detailed responses. KGs, on the\nother hand,
|
289
|
+
provide a structured representation of real-world entities\nand their relationships,
|
290
|
+
enabling the exploration of hidden insights and\nthe discovery of complex
|
291
|
+
connections.\n\nThe integration of RAG and KGs opens up two pipeline possibilities.
|
292
|
+
The\nLLM-centric pipeline prioritizes the language model''s output, which
|
293
|
+
is\nthen enriched with information retrieved from KGs. The Knowledge\nGraphs-centric
|
294
|
+
pipeline, on the other hand, places KGs at the center,\nutilizing RAG techniques
|
295
|
+
to translate the structured insights into\ncompelling and informative language.\n\nWhile
|
296
|
+
integrating LLMs and a knowledge graph for content retrieval\nrequires careful
|
297
|
+
planning, the reward is significant. You can gain\naccess to hidden relationships
|
298
|
+
within information, ultimately leading to\nhigher-quality output information.\n\nTools
|
299
|
+
like **OpenAI**, **Langchain**, and **LlamaIndex** provide\nready-made pipelines
|
300
|
+
to integrate knowledge graphs (like **Neo4j**)\neasily. Meanwhile, open-source
|
301
|
+
LLMs like **Mistral**, **Llama**, and\n**Dolphin** are catching up to proprietary
|
302
|
+
models in performance, making\nthem attractive choices for building custom
|
303
|
+
architectures. This\nopen-source scenario allows for the exploration and examination
|
304
|
+
of\nvarious methods before fully committing to a particular technological\nframework.
|
305
|
+
So, it is crucial to evaluate your needs and choose the\napproach that best
|
306
|
+
fits your use\u00a0case.\n\n{width=\"1\"\nheight=\"1\"}\n","doi":"https://doi.org/10.59350/jhrs4-22440","guid":"https://medium.com/p/fc0a6900f7eb","id":"05f01f68-ef81-47d7-a3c1-40aba91d358f","image":"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png","indexed_at":1706690571,"language":"en","published_at":1705557796,"reference":[],"relationships":[],"summary":"<strong>\n
|
307
|
+
Tools and Platform for Integration of Knowledge Graph with RAG pipelines.\n</strong>\nAuthors:
|
308
|
+
Aland Astudillo, Aishwarya Nambissan Many users of chatbots such as ChatGPT,
|
309
|
+
have encountered the problem of receiving inappropriate or incompatible responses.
|
310
|
+
There are several reasons why this might\u00a0happen. One reason is the lack
|
311
|
+
of appropriate training data, as chatbots are usually trained on large amounts
|
312
|
+
of text and code.","tags":["Artificial-intelligence","Machine-learning","Retrieval-augmented","Knowledge-graph"],"title":"Unveiling
|
313
|
+
the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs","updated_at":1705557796,"url":"https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb"}
|
314
|
+
|
315
|
+
'
|
316
|
+
recorded_at: Wed, 31 Jan 2024 19:50:01 GMT
|
317
|
+
recorded_with: VCR 6.2.0
|
@@ -90,7 +90,7 @@ describe Commonmeta::Metadata, vcr: true do
|
|
90
90
|
"affiliation" => [{ "name" => "Тверская государственная сельскохозяйственная академия" }], "familyName" => "Ганичева", "givenName" => "А.В.", "type" => "Person", "contributorRoles" => ["Author"],
|
91
91
|
)
|
92
92
|
expect(subject.titles.last).to eq("title" => "MODEL OF SYSTEM DYNAMICS OF PROCESS OF TRAINING",
|
93
|
-
"
|
93
|
+
"type" => "TranslatedTitle")
|
94
94
|
expect(subject.date).to eq("created" => "2019-02-12", "published" => "2019",
|
95
95
|
"registered" => "2019-02-12", "updated" => "2022-08-23")
|
96
96
|
expect(subject.publisher).to eq("name" => "МОДЕЛИРОВАНИЕ, ОПТИМИЗАЦИЯ И ИНФОРМАЦИОННЫЕ ТЕХНОЛОГИИ")
|
@@ -114,10 +114,14 @@ describe Commonmeta::Metadata, vcr: true do
|
|
114
114
|
expect(subject.contributors.first).to eq(
|
115
115
|
"name" => "Europäische Kommission", "contributorRoles" => ["Author"], "type" => "Organization",
|
116
116
|
)
|
117
|
-
expect(subject.titles).to eq([
|
118
|
-
|
119
|
-
|
120
|
-
|
117
|
+
expect(subject.titles).to eq([{ "language" => "de", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
|
118
|
+
{ "language" => "en", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
|
119
|
+
{ "language" => "de",
|
120
|
+
"title" => "The Common European Currency",
|
121
|
+
"type" => "Subtitle" },
|
122
|
+
{ "language" => "en",
|
123
|
+
"title" => "The Common European Currency",
|
124
|
+
"type" => "Subtitle" }])
|
121
125
|
expect(subject.subjects).to eq([{ "lang" => "en",
|
122
126
|
"subject" => "KAT12 International Institutions, Relations, Conditions",
|
123
127
|
"subjectScheme" => "ZA" },
|
@@ -163,14 +167,39 @@ describe Commonmeta::Metadata, vcr: true do
|
|
163
167
|
expect(subject.contributors.length).to eq(23)
|
164
168
|
expect(subject.contributors[0]).to eq("contributorRoles" => ["Author"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
|
165
169
|
expect(subject.contributors[2]).to eq("contributorRoles" => ["ContactPerson"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
|
166
|
-
expect(subject.date).to eq("created"=>"2022-10-27", "published"=>"2022", "registered"=>"2022-10-27", "updated"=>"2024-01-02")
|
170
|
+
expect(subject.date).to eq("created" => "2022-10-27", "published" => "2022", "registered" => "2022-10-27", "updated" => "2024-01-02")
|
167
171
|
expect(subject.publisher).to eq("name" => "Example Publisher")
|
168
|
-
expect(subject.
|
172
|
+
expect(subject.titles).to eq([{ "language" => "en", "title" => "Example Title" },
|
173
|
+
{ "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
|
174
|
+
{ "language" => "fr",
|
175
|
+
"title" => "Example TranslatedTitle",
|
176
|
+
"type" => "TranslatedTitle" },
|
177
|
+
{ "language" => "en",
|
178
|
+
"title" => "Example AlternativeTitle",
|
179
|
+
"type" => "AlternativeTitle" }])
|
180
|
+
expect(subject.descriptions).to eq([{ "description" => "Example Abstract",
|
181
|
+
"type" => "Abstract",
|
182
|
+
"language" => "en" },
|
183
|
+
{ "description" => "Example Methods",
|
184
|
+
"type" => "Methods",
|
185
|
+
"language" => "en" },
|
186
|
+
{ "description" => "Example SeriesInformation",
|
187
|
+
"type" => "Other",
|
188
|
+
"language" => "en" },
|
189
|
+
{ "description" => "Example TableOfContents",
|
190
|
+
"type" => "Other",
|
191
|
+
"language" => "en" },
|
192
|
+
{ "description" => "Example TechnicalInfo",
|
193
|
+
"type" => "TechnicalInfo",
|
194
|
+
"language" => "en" },
|
195
|
+
{ "description" => "Example Other", "type" => "Other", "language" => "en" }])
|
196
|
+
expect(subject.license).to eq("id" => "CC-PDDC", "url" => "https://creativecommons.org/licenses/publicdomain/")
|
169
197
|
end
|
170
198
|
|
171
199
|
it "instrument" do
|
172
200
|
input = "#{fixture_path}datacite-instrument.json"
|
173
201
|
subject = described_class.new(input: input)
|
202
|
+
puts subject.errors unless subject.valid?
|
174
203
|
expect(subject.valid?).to be true
|
175
204
|
expect(subject.id).to eq("https://doi.org/10.82433/08qf-ee96")
|
176
205
|
expect(subject.type).to eq("Instrument")
|
@@ -189,6 +189,31 @@ describe Commonmeta::Metadata, vcr: true do
|
|
189
189
|
expect(subject.references).to be_nil
|
190
190
|
end
|
191
191
|
|
192
|
+
it "medium post with institutional author" do
|
193
|
+
input = "https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f"
|
194
|
+
subject = described_class.new(input: input)
|
195
|
+
# expect(subject.valid?).to be true
|
196
|
+
expect(subject.id).to eq("https://doi.org/10.59350/jhrs4-22440")
|
197
|
+
expect(subject.url).to eq("https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb")
|
198
|
+
expect(subject.alternate_identifiers).to eq([{ "alternateIdentifier" => "05f01f68-ef81-47d7-a3c1-40aba91d358f", "alternateIdentifierType" => "UUID" }])
|
199
|
+
expect(subject.type).to eq("Article")
|
200
|
+
expect(subject.contributors.length).to eq(1)
|
201
|
+
expect(subject.contributors.first).to eq("contributorRoles"=>["Author"], "name"=>"Research Graph", "type"=>"Organization")
|
202
|
+
expect(subject.titles).to eq([{ "title" => "Unveiling the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs" }])
|
203
|
+
expect(subject.license).to eq("id" => "CC-BY-4.0",
|
204
|
+
"url" => "https://creativecommons.org/licenses/by/4.0/legalcode")
|
205
|
+
expect(subject.date).to eq("published"=>"2024-01-18", "updated"=>"2024-01-18")
|
206
|
+
expect(subject.descriptions.first["description"]).to start_with("<strong> Tools and Platform for Integration of Knowledge Graph with RAG pipelines.")
|
207
|
+
expect(subject.publisher).to eq("name" => "Research Graph")
|
208
|
+
expect(subject.subjects).to eq([{ "subject" => "Computer and information sciences" },
|
209
|
+
{ "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf",
|
210
|
+
"subject" => "FOS: Computer and information sciences",
|
211
|
+
"subjectScheme" => "Fields of Science and Technology (FOS)" }])
|
212
|
+
expect(subject.language).to eq("en")
|
213
|
+
expect(subject.container).to eq("identifier" => "https://medium.com/@researchgraph", "identifierType" => "URL", "title" => "Research Graph", "type" => "Periodical")
|
214
|
+
expect(subject.references).to be_nil
|
215
|
+
end
|
216
|
+
|
192
217
|
it "syldavia gazette post with references" do
|
193
218
|
input = "https://api.rogue-scholar.org/posts/0022b9ef-525a-4a79-81ad-13411697f58a"
|
194
219
|
subject = described_class.new(input: input)
|
@@ -33,11 +33,38 @@ describe Commonmeta::Metadata, vcr: true do
|
|
33
33
|
"volume" => "426",
|
34
34
|
"firstPage" => "181",
|
35
35
|
"containerTitle" => "Nature")
|
36
|
-
expect(json["date"]).to eq("published"=>"2014-02-11", "updated"=>"2022-03-26")
|
36
|
+
expect(json["date"]).to eq("published" => "2014-02-11", "updated" => "2022-03-26")
|
37
37
|
expect(json["descriptions"].first["description"]).to start_with("Among various advantages,")
|
38
|
-
expect(json["license"]).to eq("id"=>"CC-BY-3.0", "url"=>"https://creativecommons.org/licenses/by/3.0/legalcode")
|
38
|
+
expect(json["license"]).to eq("id" => "CC-BY-3.0", "url" => "https://creativecommons.org/licenses/by/3.0/legalcode")
|
39
39
|
expect(json["provider"]).to eq("Crossref")
|
40
|
-
expect(json["files"].first).to eq("mimeType"=>"application/pdf", "url"=>"https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
|
40
|
+
expect(json["files"].first).to eq("mimeType" => "application/pdf", "url" => "https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
|
41
|
+
end
|
42
|
+
|
43
|
+
it "dataset schema v4.5" do
|
44
|
+
input = "#{fixture_path}datacite-dataset_v4.5.json"
|
45
|
+
subject = described_class.new(input: input)
|
46
|
+
expect(subject.id).to eq("https://doi.org/10.82433/b09z-4k37")
|
47
|
+
json = JSON.parse(subject.commonmeta)
|
48
|
+
expect(json["id"]).to eq("https://doi.org/10.82433/b09z-4k37")
|
49
|
+
expect(json["type"]).to eq("Dataset")
|
50
|
+
expect(json["titles"]).to eq([{ "language" => "en", "title" => "Example Title" },
|
51
|
+
{ "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
|
52
|
+
{ "language" => "fr",
|
53
|
+
"title" => "Example TranslatedTitle",
|
54
|
+
"type" => "TranslatedTitle" },
|
55
|
+
{ "language" => "en",
|
56
|
+
"title" => "Example AlternativeTitle",
|
57
|
+
"type" => "AlternativeTitle" }])
|
58
|
+
expect(json["descriptions"]).to eq([{ "description" => "Example Abstract", "language" => "en", "type" => "Abstract" },
|
59
|
+
{ "description" => "Example Methods", "language" => "en", "type" => "Methods" },
|
60
|
+
{ "description" => "Example SeriesInformation",
|
61
|
+
"language" => "en",
|
62
|
+
"type" => "Other" },
|
63
|
+
{ "description" => "Example TableOfContents", "language" => "en", "type" => "Other" },
|
64
|
+
{ "description" => "Example TechnicalInfo",
|
65
|
+
"language" => "en",
|
66
|
+
"type" => "TechnicalInfo" },
|
67
|
+
{ "description" => "Example Other", "language" => "en", "type" => "Other" }])
|
41
68
|
end
|
42
69
|
end
|
43
70
|
end
|
@@ -7,6 +7,7 @@ describe Commonmeta::Metadata, vcr: true do
|
|
7
7
|
it 'Dataset' do
|
8
8
|
input = 'https://doi.org/10.5061/DRYAD.8515'
|
9
9
|
subject = described_class.new(input: input, from: 'datacite')
|
10
|
+
puts subject.errors unless subject.valid?
|
10
11
|
expect(subject.valid?).to be true
|
11
12
|
json = JSON.parse(subject.csl)
|
12
13
|
expect(json['type']).to eq('dataset')
|
@@ -37,6 +37,7 @@ describe Commonmeta::Metadata, vcr: true do
|
|
37
37
|
it 'text' do
|
38
38
|
input = 'https://doi.org/10.3204/desy-2014-01645'
|
39
39
|
subject = described_class.new(input: input, from: 'datacite')
|
40
|
+
puts subject.errors unless subject.valid?
|
40
41
|
expect(subject.valid?).to be true
|
41
42
|
csv = subject.csv.parse_csv
|
42
43
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: commonmeta-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Martin Fenner
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-01-
|
11
|
+
date: 2024-01-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -694,7 +694,7 @@ files:
|
|
694
694
|
- lib/commonmeta/xml_converter.rb
|
695
695
|
- resources/2008/09/xsd.xsl
|
696
696
|
- resources/cff.json
|
697
|
-
- resources/commonmeta_v0.10.
|
697
|
+
- resources/commonmeta_v0.10.7.json
|
698
698
|
- resources/crossref/AccessIndicators.xsd
|
699
699
|
- resources/crossref/JATS-journalpublishing1-3d2-mathml3-elements.xsd
|
700
700
|
- resources/crossref/JATS-journalpublishing1-3d2-mathml3.xsd
|
@@ -921,6 +921,7 @@ files:
|
|
921
921
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_without_doi.yml
|
922
922
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/jekyll_post.yml
|
923
923
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/jekyll_post_with_anonymous_author.yml
|
924
|
+
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/medium_post_with_institutional_author.yml
|
924
925
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/substack_post_with_broken_reference.yml
|
925
926
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/syldavia_gazette_post_with_references.yml
|
926
927
|
- spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/upstream_post_with_references.yml
|