commonmeta-ruby 3.11.0 → 3.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7bc4df0a364fc1ff53342960355c5fc534a0c479cf825cc0d856e282e42e7d49
4
- data.tar.gz: 57dcd210498057807f46ffb851eb40b4db189636d915152a77de1fd1191012c9
3
+ metadata.gz: 5acc4ac253ffc536724d14bf4d6cc58710978da3587c21035d946889b454a959
4
+ data.tar.gz: c1eac95196a7e2f01b52c5f6f94e0055299554e59824dfc88bda9de661c34193
5
5
  SHA512:
6
- metadata.gz: 2a4265c7a8d17ab99b963459015232400bcd7541e4c827e2bf0863e9809fbe64d70d0ce6ced7dbcf2a9186beca2075be73617bd3d3cace2e4b9c67e39d8e756a
7
- data.tar.gz: 765b9d3d29683badac4ed6b77ac6f86f6b939f5081d23016c1484a9f8b7e0d26039559a9f6d5c2e2b8fe5c06bc1d70d5e09e17f16a5a24c6c9bb84291a8c1548
6
+ metadata.gz: 95b8264ab1e837f26971d12df81ec4b3fc156d21d63387b0053646aacf80f8be6b03d77837759d2876ecd35c05400114a8c559df45219c74160422e58b73d868
7
+ data.tar.gz: 1cca6f5bfa1bd30d966744931a88054cea207f4ed50f1c94b5a428e656be4f7216b71055f9e66d2ac03cb60b2140e3610f1a8c60c5bd26d4644daa0d4ece0b13
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- commonmeta-ruby (3.11.0)
4
+ commonmeta-ruby (3.12.0)
5
5
  activesupport (>= 4.2.5, < 8.0)
6
6
  addressable (~> 2.8.1, < 2.8.2)
7
7
  base32-url (>= 0.7.0, < 1)
@@ -58,7 +58,8 @@ GEM
58
58
  rubocop (~> 1.0)
59
59
  concurrent-ruby (1.2.3)
60
60
  connection_pool (2.4.1)
61
- crack (0.4.5)
61
+ crack (0.4.6)
62
+ bigdecimal
62
63
  rexml
63
64
  crass (1.0.6)
64
65
  csl (2.0.0)
@@ -66,7 +67,7 @@ GEM
66
67
  rexml
67
68
  csl-styles (2.0.1)
68
69
  csl (~> 2.0)
69
- diff-lcs (1.5.0)
70
+ diff-lcs (1.5.1)
70
71
  docile (1.4.0)
71
72
  domain_name (0.6.20240107)
72
73
  drb (2.2.0)
@@ -154,7 +155,7 @@ GEM
154
155
  iniparser (>= 0.1.0)
155
156
  public_suffix (4.0.7)
156
157
  racc (1.7.3)
157
- rack (3.0.8)
158
+ rack (3.0.9)
158
159
  rack-test (2.1.0)
159
160
  rack (>= 1.3)
160
161
  rainbow (3.1.1)
@@ -25,8 +25,8 @@ module Commonmeta
25
25
  "Researcher" => "Other",
26
26
  "Sponsor" => "Other",
27
27
  "Supervisor" => "Supervision",
28
- "WorkPackageLeader" => "Other"
29
- }
28
+ "WorkPackageLeader" => "Other",
29
+ }
30
30
 
31
31
  def get_one_author(author)
32
32
  # basic sanity checks
@@ -55,20 +55,20 @@ module Commonmeta
55
55
  parse_attributes(author.fetch("identifier", nil), first: true) ||
56
56
  parse_attributes(author.fetch("sameAs", nil), first: true)
57
57
  id = normalize_orcid(id) || normalize_ror(id) if id.present?
58
-
58
+
59
59
  # DataCite metadata
60
60
  if id.nil? && author["nameIdentifiers"].present?
61
61
  id = Array.wrap(author.dig("nameIdentifiers")).find do |ni|
62
62
  normalize_name_identifier(ni).present?
63
63
  end
64
64
  id = normalize_name_identifier(id) if id.present?
65
- # Crossref metadata
65
+ # Crossref metadata
66
66
  elsif id.nil? && author["ORCID"].present?
67
67
  id = author.fetch("ORCID")
68
68
  id = normalize_orcid(id)
69
- # JSON Feed metadata
69
+ # JSON Feed metadata
70
70
  elsif id.nil? && author["url"].present?
71
- id = author.fetch("url")
71
+ id = author.fetch("url")
72
72
  end
73
73
 
74
74
  # parse author type, i.e. "Person", "Organization" or not specified
@@ -168,6 +168,9 @@ module Commonmeta
168
168
  # check if a name has only one word, e.g. "FamousOrganization", not including commas
169
169
  return false if name.to_s.split(" ").size == 1 && name.to_s.exclude?(",")
170
170
 
171
+ # check if name contains words known to be used in organization names
172
+ return false if %w[University College Institute School Center Department Laboratory Library Museum Foundation Society Association Company Corporation Collaboration Consortium Incorporated Inc. Institut Research Science].any? { |word| name.to_s.include?(word) }
173
+
171
174
  # check for suffixes, e.g. "John Smith, MD"
172
175
  return true if name && %w[MD PhD].include?(name.split(", ").last)
173
176
 
@@ -4,29 +4,29 @@ module Commonmeta
4
4
  module Readers
5
5
  module DataciteReader
6
6
  def get_datacite(id: nil, **options)
7
- return { 'string' => nil, 'state' => 'not_found' } unless id.present?
7
+ return { "string" => nil, "state" => "not_found" } unless id.present?
8
8
 
9
9
  api_url = datacite_api_url(id, options)
10
10
  response = HTTP.get(api_url)
11
- return { 'string' => nil, 'state' => 'not_found' } unless response.status.success?
11
+ return { "string" => nil, "state" => "not_found" } unless response.status.success?
12
12
 
13
13
  body = JSON.parse(response.body)
14
- client = Array.wrap(body.fetch('included', nil)).find do |m|
15
- m['type'] == 'clients'
14
+ client = Array.wrap(body.fetch("included", nil)).find do |m|
15
+ m["type"] == "clients"
16
16
  end
17
- client_id = client.to_h.fetch('id', nil)
18
- provider_id = Array.wrap(client.to_h.fetch('relationships', nil)).find do |m|
19
- m['provider'].present?
20
- end.to_h.dig('provider', 'data', 'id')
21
-
22
- { 'string' => response.body.to_s,
23
- 'provider_id' => provider_id,
24
- 'client_id' => client_id }
17
+ client_id = client.to_h.fetch("id", nil)
18
+ provider_id = Array.wrap(client.to_h.fetch("relationships", nil)).find do |m|
19
+ m["provider"].present?
20
+ end.to_h.dig("provider", "data", "id")
21
+
22
+ { "string" => response.body.to_s,
23
+ "provider_id" => provider_id,
24
+ "client_id" => client_id }
25
25
  end
26
26
 
27
27
  def read_datacite(string: nil, **_options)
28
28
  errors = jsonlint(string)
29
- return { 'errors' => errors } if errors.present?
29
+ return { "errors" => errors } if errors.present?
30
30
 
31
31
  read_options = ActiveSupport::HashWithIndifferentAccess.new(_options.except(:doi, :id, :url,
32
32
  :sandbox, :validate, :ra))
@@ -34,140 +34,146 @@ module Commonmeta
34
34
  meta = string.present? ? JSON.parse(string) : {}
35
35
 
36
36
  # optionally strip out the message wrapper from API
37
- meta = meta.dig('data', 'attributes') if meta.dig('data').present?
37
+ meta = meta.dig("data", "attributes") if meta.dig("data").present?
38
38
 
39
39
  meta.transform_keys!(&:underscore)
40
40
 
41
- id = normalize_doi(meta.fetch('doi', nil))
41
+ id = normalize_doi(meta.fetch("doi", nil))
42
42
 
43
- resource_type_general = meta.dig('types', 'resourceTypeGeneral')
44
- resource_type = meta.dig('types', 'resourceType')
43
+ resource_type_general = meta.dig("types", "resourceTypeGeneral")
44
+ resource_type = meta.dig("types", "resourceType")
45
45
  # if resource_type is one of the new resource_type_general types introduced in schema 4.3, use it
46
46
  type = Commonmeta::Utils::DC_TO_CM_TRANSLATIONS.fetch(resource_type, nil) ||
47
- Commonmeta::Utils::DC_TO_CM_TRANSLATIONS.fetch(resource_type_general, 'Other')
47
+ Commonmeta::Utils::DC_TO_CM_TRANSLATIONS.fetch(resource_type_general, "Other")
48
48
 
49
- alternate_identifiers = Array.wrap(meta.fetch('alternate_identifiers', nil)).map do |i|
49
+ alternate_identifiers = Array.wrap(meta.fetch("alternate_identifiers", nil)).map do |i|
50
50
  i.transform_keys! { |k| k.camelize(:lower) }
51
51
  end
52
- url = meta.fetch('url', nil)
53
- titles = Array.wrap(meta.fetch('titles', nil)).map do |title|
54
- title.compact
52
+ url = meta.fetch("url", nil)
53
+ titles = Array.wrap(meta.fetch("titles", nil)).map do |title|
54
+ { "title" => title.fetch("title", nil),
55
+ "type" => title.fetch("titleType", nil),
56
+ "language" => title.fetch("lang", nil) }.compact
55
57
  end
56
- contributors = get_authors(from_datacite(meta.fetch('creators', nil)))
57
- contributors += get_authors(from_datacite(meta.fetch('contributors', nil)))
58
- if meta.fetch('publisher', nil).is_a?(Hash)
59
- publisher = { 'name' => meta.fetch('publisher', nil).fetch('name', nil) }
60
- elsif meta.fetch('publisher', nil).is_a?(String)
61
- publisher = { 'name' => meta.fetch('publisher', nil) }
58
+ contributors = get_authors(from_datacite(meta.fetch("creators", nil)))
59
+ contributors += get_authors(from_datacite(meta.fetch("contributors", nil)))
60
+ if meta.fetch("publisher", nil).is_a?(Hash)
61
+ publisher = { "name" => meta.fetch("publisher", nil).fetch("name", nil) }
62
+ elsif meta.fetch("publisher", nil).is_a?(String)
63
+ publisher = { "name" => meta.fetch("publisher", nil) }
62
64
  else
63
65
  publisher = nil
64
66
  end
65
67
 
66
- container = meta.fetch('container', nil)
67
- funding_references = meta.fetch('funding_references', nil)
68
+ container = meta.fetch("container", nil)
69
+ funding_references = meta.fetch("funding_references", nil)
68
70
 
69
71
  date = {}
70
- date['created'] =
71
- get_iso8601_date(meta.dig('created')) || get_date(meta.dig('dates'), 'Created')
72
- date['published'] =
73
- get_iso8601_date(meta.dig('published')) || get_date(meta.dig('dates'),
74
- 'Issued') || get_iso8601_date(meta.dig('publication_year'))
75
- date['registered'] = get_iso8601_date(meta.dig('registered'))
76
- date['updated'] =
77
- get_iso8601_date(meta.dig('updated')) || get_date(meta.dig('dates'), 'Updated')
78
-
79
- descriptions = Array.wrap(meta.fetch('descriptions', nil)).map do |description|
80
- description.compact
72
+ date["created"] =
73
+ get_iso8601_date(meta.dig("created")) || get_date(meta.dig("dates"), "Created")
74
+ date["published"] =
75
+ get_iso8601_date(meta.dig("published")) || get_date(meta.dig("dates"),
76
+ "Issued") || get_iso8601_date(meta.dig("publication_year"))
77
+ date["registered"] = get_iso8601_date(meta.dig("registered"))
78
+ date["updated"] =
79
+ get_iso8601_date(meta.dig("updated")) || get_date(meta.dig("dates"), "Updated")
80
+
81
+ descriptions = Array.wrap(meta.fetch("descriptions", nil)).map do |description|
82
+ description_type = description.fetch("descriptionType", nil)
83
+ description_type = "Other" unless %w[Abstract Methods TechnicalInfo].include?(description_type)
84
+ { "description" => description.fetch("description", nil),
85
+ "type" => description_type,
86
+ "language" => description.fetch("lang", nil) }.compact
81
87
  end
82
- license = Array.wrap(meta.fetch('rights_list', nil)).find do |r|
83
- r['rightsUri'].present?
88
+ license = Array.wrap(meta.fetch("rights_list", nil)).find do |r|
89
+ r["rightsUri"].present?
84
90
  end
85
- license = hsh_to_spdx('rightsURI' => license['rightsUri']) if license.present?
86
- version = meta.fetch('version', nil)
87
- subjects = meta.fetch('subjects', nil)
88
- language = meta.fetch('language', nil)
89
- geo_locations = meta.fetch('geo_locations', nil)
90
- references = (Array.wrap(meta.fetch('related_identifiers',
91
- nil)) + Array.wrap(meta.fetch('related_items',
91
+ license = hsh_to_spdx("rightsURI" => license["rightsUri"]) if license.present?
92
+ version = meta.fetch("version", nil)
93
+ subjects = meta.fetch("subjects", nil)
94
+ language = meta.fetch("language", nil)
95
+ geo_locations = meta.fetch("geo_locations", nil)
96
+ references = (Array.wrap(meta.fetch("related_identifiers",
97
+ nil)) + Array.wrap(meta.fetch("related_items",
92
98
  nil))).select do |r|
93
- %w[References Cites IsSupplementedBy].include?(r['relationType'])
94
- end.map do |reference|
99
+ %w[References Cites IsSupplementedBy].include?(r["relationType"])
100
+ end.map do |reference|
95
101
  get_datacite_reference(reference)
96
102
  end
97
- files = Array.wrap(meta.fetch("content_url", nil)).map { |file| { "url" => file } }
98
- formats = meta.fetch('formats', nil)
99
- sizes = meta.fetch('sizes', nil)
100
- schema_version = meta.fetch('schema_version', nil) || 'http://datacite.org/schema/kernel-4'
101
- state = id.present? || read_options.present? ? 'findable' : 'not_found'
102
-
103
- { 'id' => id,
104
- 'type' => type,
105
- 'additional_type' => resource_type == type ? nil : resource_type,
106
- 'url' => url,
107
- 'titles' => titles,
108
- 'contributors' => contributors,
109
- 'container' => container,
110
- 'publisher' => publisher,
111
- 'provider' => 'DataCite',
112
- 'alternate_identifiers' => alternate_identifiers.presence,
113
- 'references' => references,
114
- 'funding_references' => funding_references,
115
- 'files' => files.presence,
116
- 'date' => date.compact,
117
- 'descriptions' => descriptions,
118
- 'license' => license,
119
- 'version' => version,
120
- 'subjects' => subjects,
121
- 'language' => language,
122
- 'geo_locations' => geo_locations,
123
- 'formats' => formats,
124
- 'sizes' => sizes,
125
- 'state' => state }.compact # .merge(read_options)
103
+ files = Array.wrap(meta.fetch("content_url", nil)).map { |file| { "url" => file } }
104
+ formats = meta.fetch("formats", nil)
105
+ sizes = meta.fetch("sizes", nil)
106
+ schema_version = meta.fetch("schema_version", nil) || "http://datacite.org/schema/kernel-4"
107
+ state = id.present? || read_options.present? ? "findable" : "not_found"
108
+
109
+ { "id" => id,
110
+ "type" => type,
111
+ "additional_type" => resource_type == type ? nil : resource_type,
112
+ "url" => url,
113
+ "titles" => titles,
114
+ "contributors" => contributors,
115
+ "container" => container,
116
+ "publisher" => publisher,
117
+ "provider" => "DataCite",
118
+ "alternate_identifiers" => alternate_identifiers.presence,
119
+ "references" => references,
120
+ "funding_references" => funding_references,
121
+ "files" => files.presence,
122
+ "date" => date.compact,
123
+ "descriptions" => descriptions,
124
+ "license" => license,
125
+ "version" => version,
126
+ "subjects" => subjects,
127
+ "language" => language,
128
+ "geo_locations" => geo_locations,
129
+ "formats" => formats,
130
+ "sizes" => sizes,
131
+ "state" => state }.compact # .merge(read_options)
126
132
  end
127
133
 
128
134
  def format_contributor(contributor)
129
- type = contributor.fetch('nameType', nil)
130
-
131
- { 'name' => type == 'Person' ? nil : contributor.fetch('name', nil),
132
- 'type' => type,
133
- 'givenName' => contributor.fetch('givenName', nil),
134
- 'familyName' => contributor.fetch('familyName', nil),
135
- 'nameIdentifiers' => contributor.fetch('nameIdentifiers', nil).presence,
136
- 'affiliations' => contributor.fetch('affiliations', nil).presence,
137
- 'contributorType' => contributor.fetch('contributorType', nil) }.compact
135
+ type = contributor.fetch("nameType", nil)
136
+
137
+ { "name" => type == "Person" ? nil : contributor.fetch("name", nil),
138
+ "type" => type,
139
+ "givenName" => contributor.fetch("givenName", nil),
140
+ "familyName" => contributor.fetch("familyName", nil),
141
+ "nameIdentifiers" => contributor.fetch("nameIdentifiers", nil).presence,
142
+ "affiliations" => contributor.fetch("affiliations", nil).presence,
143
+ "contributorType" => contributor.fetch("contributorType", nil) }.compact
138
144
  end
139
145
 
140
146
  def get_datacite_reference(reference)
141
147
  return nil unless reference.present? || !reference.is_a?(Hash)
142
148
 
143
- key = reference['relatedIdentifier']
149
+ key = reference["relatedIdentifier"]
144
150
  doi = nil
145
151
  url = nil
146
152
 
147
- case reference['relatedIdentifierType']
148
- when 'DOI'
149
- doi = normalize_doi(reference['relatedIdentifier'])
150
- when 'URL'
151
- url = reference['relatedIdentifier']
153
+ case reference["relatedIdentifierType"]
154
+ when "DOI"
155
+ doi = normalize_doi(reference["relatedIdentifier"])
156
+ when "URL"
157
+ url = reference["relatedIdentifier"]
152
158
  else
153
- url = reference['relatedIdentifier']
159
+ url = reference["relatedIdentifier"]
154
160
  end
155
161
 
156
162
  {
157
- 'key' => key,
158
- 'doi' => doi,
159
- 'url' => url,
160
- 'contributor' => reference.dig('author'),
161
- 'title' => reference.dig('article-title'),
162
- 'publisher' => reference.dig('publisher'),
163
- 'publicationYear' => reference.dig('year'),
164
- 'volume' => reference.dig('volume'),
165
- 'issue' => reference.dig('issue'),
166
- 'firstPage' => reference.dig('first-page'),
167
- 'lastPage' => reference.dig('last-page'),
168
- 'containerTitle' => reference.dig('journal-title'),
169
- 'edition' => nil,
170
- 'unstructured' => doi.nil? ? reference.dig('unstructured') : nil
163
+ "key" => key,
164
+ "doi" => doi,
165
+ "url" => url,
166
+ "contributor" => reference.dig("author"),
167
+ "title" => reference.dig("article-title"),
168
+ "publisher" => reference.dig("publisher"),
169
+ "publicationYear" => reference.dig("year"),
170
+ "volume" => reference.dig("volume"),
171
+ "issue" => reference.dig("issue"),
172
+ "firstPage" => reference.dig("first-page"),
173
+ "lastPage" => reference.dig("last-page"),
174
+ "containerTitle" => reference.dig("journal-title"),
175
+ "edition" => nil,
176
+ "unstructured" => doi.nil? ? reference.dig("unstructured") : nil,
171
177
  }.compact
172
178
  end
173
179
  end
@@ -5,7 +5,7 @@ require "pathname"
5
5
 
6
6
  module Commonmeta
7
7
  module SchemaUtils
8
- COMMONMETA = File.read(File.expand_path("../../resources/commonmeta_v0.10.6.json",
8
+ COMMONMETA = File.read(File.expand_path("../../resources/commonmeta_v0.10.7.json",
9
9
  __dir__))
10
10
 
11
11
  def json_schema_errors
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Commonmeta
4
- VERSION = '3.11.0'
4
+ VERSION = '3.12.0'
5
5
  end
@@ -8,4 +8,4 @@ module Commonmeta
8
8
  end
9
9
  end
10
10
  end
11
- end
11
+ end
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "$schema": "http://json-schema.org/draft-07/schema#",
3
- "$id": "https://commonmeta.org/commonmeta_v0.10.6.json",
4
- "title": "Commonmeta v0.10.6",
3
+ "$id": "https://commonmeta.org/commonmeta_v0.10.7.json",
4
+ "title": "Commonmeta v0.10.7",
5
5
  "description": "JSON representation of the Commonmeta schema.",
6
6
  "additionalProperties": false,
7
7
  "definitions": {
@@ -253,6 +253,10 @@
253
253
  "description": "The type of the title.",
254
254
  "type": "string",
255
255
  "enum": ["AlternativeTitle", "Subtitle", "TranslatedTitle"]
256
+ },
257
+ "language": {
258
+ "description": "The language of the title. Use one of the language codes from the IETF BCP 47 standard.",
259
+ "type": "string"
256
260
  }
257
261
  },
258
262
  "required": ["title"]
@@ -424,7 +428,11 @@
424
428
  "type": {
425
429
  "description": "The type of the description.",
426
430
  "type": "string",
427
- "enum": ["Abstract", "Description", "Summary"]
431
+ "enum": ["Abstract", "Summary", "Methods", "TechnicalInfo", "Other"]
432
+ },
433
+ "language": {
434
+ "description": "The language of the title. Use one of the language codes from the IETF BCP 47 standard.",
435
+ "type": "string"
428
436
  }
429
437
  },
430
438
  "required": ["description"]
@@ -58,6 +58,16 @@ describe Commonmeta::Metadata, vcr: true do
58
58
  author = { "name" => "Tejas S. Sathe, MD" }
59
59
  expect(subject.is_personal_name?(name: author["name"])).to be true
60
60
  end
61
+
62
+ it "name with organization string" do
63
+ author = { "name" => "University of California, Santa Barbara" }
64
+ expect(subject.is_personal_name?(name: author["name"])).to be false
65
+ end
66
+
67
+ it "name with another organization string" do
68
+ author = { "name" => "Research Graph" }
69
+ expect(subject.is_personal_name?(name: author["name"])).to be false
70
+ end
61
71
  end
62
72
 
63
73
  context "cleanup_author" do
@@ -0,0 +1,317 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f
6
+ body:
7
+ encoding: ASCII-8BIT
8
+ string: ''
9
+ headers:
10
+ Connection:
11
+ - close
12
+ Host:
13
+ - api.rogue-scholar.org
14
+ User-Agent:
15
+ - http.rb/5.1.1
16
+ response:
17
+ status:
18
+ code: 200
19
+ message: OK
20
+ headers:
21
+ Content-Type:
22
+ - application/json
23
+ Content-Length:
24
+ - '23886'
25
+ Ratelimit-Limit:
26
+ - '15'
27
+ Ratelimit-Remaining:
28
+ - '14'
29
+ Ratelimit-Reset:
30
+ - '3'
31
+ Date:
32
+ - Wed, 31 Jan 2024 19:50:01 GMT
33
+ Server:
34
+ - Fly/ba9e227a (2024-01-26)
35
+ Via:
36
+ - 1.1 fly.io
37
+ Fly-Request-Id:
38
+ - 01HNGH4EZV3XQF20H1PZ6X5N07-fra
39
+ body:
40
+ encoding: UTF-8
41
+ string: '{"abstract":null,"archive_url":null,"authors":[{"name":"Research Graph"}],"blog":{"api":false,"archive_prefix":null,"authors":null,"backlog":0,"canonical_url":null,"category":"computerAndInformationSciences","created_at":1706685423,"current_feed_url":null,"description":"Stories
42
+ by Research Graph on Medium","favicon":"https://cdn-images-1.medium.com/fit/c/150/150/1*laJi0jBkVoGhXid7gD_DmQ.png","feed_format":"application/rss+xml","feed_url":"https://medium.com/@researchgraph/feed","filter":null,"funding":null,"generator":"Medium","generator_raw":"Medium","home_page_url":"https://medium.com/@researchgraph","id":"30da2ca9-8258-4ab5-acca-3919d9a5d98d","indexed":true,"issn":null,"language":"en","license":"https://creativecommons.org/licenses/by/4.0/legalcode","mastodon":"","plan":"Starter","prefix":"10.59350","relative_url":null,"ror":null,"secure":true,"slug":"researchgraph","status":"active","title":"Research
43
+ Graph","updated_at":1706151454,"use_api":null,"use_mastodon":false,"user_id":"a7e16958-1175-437c-b839-d4b8a47ec811","version":"https://jsonfeed.org/version/1.1"},"blog_name":"Research
44
+ Graph","blog_slug":"researchgraph","content_text":"**Tools and Platform for
45
+ Integration of Knowledge Graph with RAG\npipelines.**\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png\"\nalt=\"Complex
46
+ network connected to books and showing information from magespace\" />\n<figcaption>Image
47
+ Created in <a\nhref=\"https://www.mage.space/\">https://www.mage.space/</a></figcaption>\n</figure>\n\nAuthors:
48
+ [Aland\nAstudillo](https://www.linkedin.com/in/aland-astudillo/), [Aishwarya\nNambissan](https://www.linkedin.com/in/aishwarya-nambissan-127229200/)\n\nMany
49
+ users of chatbots such as ChatGPT, have encountered the problem of\nreceiving
50
+ inappropriate or incompatible responses. There are several\nreasons why this
51
+ might\u00a0happen.\n\nOne reason is the lack of appropriate training data,
52
+ as chatbots are\nusually trained on large amounts of text and code. If the
53
+ data is\ninsufficient or of poor quality, the chatbot may misunderstand queries\nand
54
+ provide inaccurate responses. Another reason is that some chatbots\nare designed
55
+ for specific tasks or domains, which limits their ability\nto handle broader
56
+ queries or understand subtle nuances in conversation.\nAdditionally, chatbots
57
+ may struggle with natural language, which is\ncomplex and often ambiguous.
58
+ This can cause them to misunderstand a\nuser''s query and provide irrelevant
59
+ or off-topic responses. Finally,\nthere are technical limitations, such as
60
+ the chatbot''s inability to\nreason or make inferences.\n\nThis article explores
61
+ a potential solution by combining two influential\napproaches in the field
62
+ of Natural Language Processing\u200a---\u200aRetrieval\nAugmented Generation
63
+ (**RAG**) and Knowledge Graphs(**KGs**). We will\ndelve into the partnership
64
+ between these two entities, discuss the\nnotable technologies and software
65
+ used in their processes, and highlight\nvarious options for utilizing their
66
+ combined potential.\n\n### **RAG**\n\nRetrieval-Augmented Generation is the
67
+ process of optimizing the output\nof a large language model using a knowledge
68
+ base outside of its training\ndata sources before generating a response. It
69
+ takes an input and\nretrieves a set of relevant/supporting documents given
70
+ a source (e.g.,\nWikipedia). This can be thought of as a Large Language Model
71
+ (LLM) not\njust putting words together, but carefully selecting relevant\ninformation
72
+ from external sources and Knowledge Graphs to create\nwell-informed and detailed
73
+ responses.\n\n### RAG Retrieval Techniques\n\nThe following are some crucial
74
+ technologies that enable RAG''s impressive\nability to retrieve and incorporate
75
+ relevant information:\n\n**Vector Search**: It transforms text into numerical
76
+ vectors, capturing\ntheir meaning and nuances in a mathematical space, creating
77
+ a map of\nrelationships. Similar texts, like those discussing shared topics
78
+ or\nusing similar language, end up positioned close together in this space,\nallowing
79
+ vector search to quickly identify them as related. This allows\nlightning-fast
80
+ comparisons, finding similar texts based on meaning, not\njust keywords.\n\nAlgorithms
81
+ like [**Faiss**](https://github.com/facebookresearch/faiss)\nand [**Annoy**](https://github.com/spotify/annoy)
82
+ map text into dense\nvectors, enabling fast comparisons and retrieval of relevant
83
+ passages\nbased on semantic similarity.\n\n**Passage Ranking**: It is an internal
84
+ algorithm that scores candidate\ntext passages based on their relevance to
85
+ a query. It considers factors\nlike keyword frequency, keyword overlap, and
86
+ document structure to act\nlike a judge, sifting through information to select
87
+ the most fitting and\ninformative passages.\n\nKeyword overlap measures how
88
+ often the same keywords appear in **both**\nthe query and the candidate passage,
89
+ emphasizing shared vocabulary and\npotential relevance. It differs from keyword
90
+ frequency, which simply\ncounts how often individual keywords appear within
91
+ a passage, regardless\nof their presence in the\u00a0query.\n\nTechniques
92
+ like [**BM25**](https://github.com/getalp/wikIR) and\n[**TF-IDF**](https://github.com/marcocor/wikipedia-idf)
93
+ score candidate\npassages based on keyword overlap and frequency, ensuring
94
+ retrieved\ninformation truly fits the\u00a0context.\n\n**Graph Neural Networks**
95
+ (**GNNs**): They are neural networks designed\nto explore and learn from interconnected
96
+ data like maps, social\nnetworks, and other complex relationships. Unlike
97
+ traditional processing\nmethods that go through data in a linear fashion,
98
+ GNNs are capable of\nrecognizing hidden patterns and understanding relationships
99
+ like \"who\nknows who\" and \"what connects to what\" by \"hopping\" across
100
+ connections\nin\u00a0data.\n\nConsider a graph as a network of dots(nodes)
101
+ connected by lines (edges).\nEach dot represents some information, like a
102
+ person, object, or concept.\nThe lines tell you how these things relate to
103
+ each\u00a0other.\n\nGNNs work in rounds. In each\u00a0round:\n\n1. Message
104
+ Passing: Each node \"talks\" to its neighbors, sending\n messages along
105
+ the edges. These messages contain information about\n the node itself and
106
+ its features.\n2. Node Update: Each node receives messages from all its neighbors
107
+ and\n combines them with its own information. This update can involve\n calculations
108
+ and applying a special function.\n3. Output Calculation: Based on the updated
109
+ information, the network\n calculates an output for each node. This output
110
+ could be a\n prediction about the node''s category, its relationship to
111
+ another\n node, or some other relevant information.\n\nThis process repeats
112
+ for multiple rounds, allowing nodes to incorporate\ninformation from their
113
+ entire neighborhood, not just their direct\nneighbors. As the rounds progress,
114
+ the network learns to understand the\nrelationships between nodes and the
115
+ overall structure of the\u00a0graph.\n\nWhen dealing with Knowledge Graphs,
116
+ frameworks like\n[**PyTorch-Geometric**](https://readthedocs.org/projects/pytorch-geometric/)\nand
117
+ [**DeepMind''s\nGNN**](https://github.com/deepmind/deepmind-research/blob/master/learning_to_simulate/graph_network.py)\nlibrary
118
+ come into play. These frameworks allow GNNs to traverse\ninterconnected entities
119
+ and relationships within the graph, retrieve\nrelevant knowledge fragments,
120
+ and understand complex connections.\n\n### **Knowledge Graphs: The Structured
121
+ Wisdom\u00a0Library**\n\nA knowledge graph, also referred to as a semantic
122
+ network, is a\nstructure that represents a network of real-world entities
123
+ such as\nobjects, events, situations, or concepts. It helps to illustrate
124
+ the\nconstantly changing representations of the world, connecting entities\n(such
125
+ as \"Marie Curie\") and relationships (such as \"won Nobel Prize\") to\nform
126
+ a complex network of information. This information is typically\nstored in
127
+ a graph database and visualized as a graph structure, thus the\nterm knowledge
128
+ \"graph\".\n\nKGs go beyond simply finding relevant facts and delve deeper
129
+ into\nunderstanding the relationships and insights hidden within using these\nprocesses:\n\n**Entity
130
+ Linking**: Imagine a vast network of information, like a big\npuzzle of dots.
131
+ Now imagine trying to connect specific names, places,\nand concepts to their
132
+ corresponding dots in the puzzle. That is what\nentity linking does with text
133
+ and knowledge graphs, connecting the\nspecific components of the text to the
134
+ corresponding nodes in the graph.\nThey help systems understand the exact
135
+ meaning of entities, and find\nrelevant information from the\u00a0graph.\n\nLibraries
136
+ like [**DGL-KeLP**](https://github.com/awslabs/dgl-ke)\nleverage GNNs to identify
137
+ and link named entities (like \"Marie Curie\")\nto their respective nodes
138
+ within the Knowledge Graphs, enabling RAG to\nretrieve information that is
139
+ directly relevant to the core subject of a\nsearch\u00a0query\n\n**Path Mining**:
140
+ Path mining is a process of uncovering hidden\nrelationships and patterns
141
+ that are not easily noticeable. It involves\nexploring complicated networks
142
+ of information and identifying and\ntracing connections between entities that
143
+ may seem unrelated. By doing\nso, path mining reveals surprising insights
144
+ and useful knowledge,\nimproving our understanding of the complex structures
145
+ within knowledge\ngraphs.\n\nTools like [**Neo4j**](https://neo4j.com/) and\n[**Stanza**](https://github.com/stanfordnlp/stanza)
146
+ allow traversing\npaths between entities, uncovering hidden relationships,
147
+ and generating\ninsightful responses based on this deeper understanding.\n\n**Reasoning
148
+ and Inference**: In the context of knowledge graphs,\nreasoning and inference
149
+ are not just limited to discovering facts; they\nare also concerned with utilizing
150
+ them effectively. This involves\nintegrating data, drawing meaningful connections,
151
+ and using logical\nreasoning to resolve issues, foresee future occurrences,
152
+ or even\nconstruct narratives leveraging the insights provided by the knowledge\ngraph.\n\nConsider
153
+ the scenario of trying to find an organization that works in\nspecific sectors
154
+ with the help of a knowledge graph. This analogy\neffectively highlights the
155
+ active role of reasoning and inference in\nknowledge graphs:\n\n1. Gathering
156
+ Facts: Knowledge graphs collect and organize information\n from various
157
+ sources, such as websites, databases, academic papers,\n and social media
158
+ platforms. These facts are represented as\n structured data, with entities
159
+ (e.g., organizations) and their\n attributes (e.g., sectors in which they
160
+ operate) forming nodes and\n edges in the graph. By combining data about
161
+ organizations and\n sectors, knowledge graphs enable the gathering of relevant
162
+ facts for\n analysis.\n2. Integrating information: By connecting an organization''s\n relationships
163
+ with specific sectors, such as partnerships,\n investments, or certifications,
164
+ knowledge graphs reveal the scope\n and relevance of their work within
165
+ those sectors. Links to related\n entities like employees, board members,
166
+ or projects can further\n contribute to understanding an organization''s
167
+ involvement in\n specific\u00a0sectors.\n3. Predicting and Creating: Knowledge
168
+ graphs can leverage machine\n learning and predictive models to infer missing
169
+ or hidden\n information. By analyzing the available facts and connections
170
+ within\n the graph, these models can predict an organization''s potential\n involvement
171
+ in sectors that have common attributes with their known\n areas of operation.
172
+ For example, if an organization has expertise in\n renewable energy, predictive
173
+ models could suggest their likely\n involvement in related sectors like
174
+ clean transportation or\n sustainable infrastructure. Additionally, knowledge
175
+ graphs\n facilitate the creation of new information and insights by combining\n existing
176
+ facts with external data sources. For instance, by\n integrating real-time
177
+ data on industry trends, market analysis, or\n news articles, knowledge
178
+ graphs enable the discovery of emerging\n sectors or upcoming organizations
179
+ that might align with the given\n parameters.\n\nA framework like [**Atomspace**](https://github.com/opencog/atomspace)\nfrom
180
+ [**OpenCog**](https://opencog.org/) empowers RAG to reason and\ninfer new
181
+ knowledge. By traversing paths and combining information from\ninterconnected
182
+ entities, the system can generate informed predictions or\nanswer hypothetical
183
+ questions.\n\n### Purpose\n\nThe combination of Retrieval-Augmented Generation
184
+ (RAG) and Knowledge\nGraphs (KG) is beneficial for several\u00a0reasons:\n\n1. **Enhanced
185
+ information retrieval**: Knowledge graphs provide\n structured and interconnected
186
+ information that can significantly\n improve the effectiveness of information
187
+ retrieval. By using KGs,\n RAG models can retrieve more accurate and relevant
188
+ information,\n leading to better generation and response\u00a0quality.\n2. **Reliable
189
+ and diverse information:** KGs are constructed from\n authoritative sources,
190
+ making them reliable and trustworthy sources\n of information. RAG models
191
+ can leverage this reliable information to\n generate more accurate responses.
192
+ Additionally, KGs help in\n diversifying the generated responses by providing
193
+ a broader pool of\n related facts and entities.\n3. **Context-aware understanding**:
194
+ KGs enable RAG models to understand\n and reason over the contextual information.
195
+ By leveraging the\n relationships and semantic connections encoded in KGs,
196
+ RAG models\n can better grasp the context of user queries or conversations,\n resulting
197
+ in more coherent and appropriate responses.\n4. **Handling complex queries**:
198
+ KGs allow RAG models to tackle complex\n queries by breaking them down
199
+ into smaller sub-queries, retrieving\n relevant pieces of information from
200
+ the KG, and then generating a\n response based on the retrieved knowledge.
201
+ This enables RAG models\n to handle a wide range of user queries effectively.\n5. **Explainability
202
+ and transparency**: KGs provide a transparent and\n interpretable representation
203
+ of knowledge. By integrating KG-based\n retrieval into RAG models, the
204
+ reasoning behind the generated\n responses becomes more explainable. Users
205
+ can have a clear\n understanding of the knowledge sources and connections
206
+ used to\n produce the response.\n6. **Scalability**: Knowledge graphs
207
+ act as large-scale repositories of\n information. RAG models can leverage
208
+ KGs to generate responses to\n various queries or conversations without
209
+ requiring additional\n supervised training data. This makes the RAG+KG
210
+ approach scalable to\n handle an extensive range of knowledge domains and
211
+ user\u00a0queries.\n\n### **Pipeline Possibilities: Orchestrating RAG and\u00a0KGs:**\n\nLet''s
212
+ explore some exciting pipeline options for harnessing the combined\npower
213
+ of RAG and Knowledge Graphs. There are two options in which either\nthe LLM
214
+ is prioritized or the Knowledge Graph is prioritized:\n\n**Option 1: LLM-Centric
215
+ Pipeline:**\n\nThe LLM-Centric pipeline is a RAG and Knowledge Graph combination
216
+ that\nempowers LLMs to craft well-informed responses. Here''s how it\u00a0works:\n\n1. Start
217
+ with the user''s question or statement\n2. The LLM (like GPT-3) generates
218
+ an initial draft response based on\n its internal knowledge. This draft
219
+ may lack specific factual details\n or nuances that a knowledge graph can\u00a0provide.\n3. RAG
220
+ kicks in, searching the text corpus or the Knowledge Graph for\n relevant
221
+ passages that enrich the draft. During the retrieval\n process, RAG retrieval
222
+ techniques are used to search not only text\n corpora but also knowledge
223
+ graphs to find relevant information. This\n means that RAG can directly
224
+ tap into the structured knowledge within\n the graph to retrieve facts,
225
+ relationships, and entities that align\n with the user''s query and the
226
+ LLM''s generated draft.\n4. The retrieved information is carefully fused
227
+ with the LLM''s output,\n creating a more factually accurate and insightful
228
+ response\n5. A final polishing step ensures the response is fluent, grammatically\n correct,
229
+ and ready to\u00a0show.\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*3pd9MOIflkbS07wI\"
230
+ />\n<figcaption>RAG LLM-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
231
+ basic steps to perform this\u00a0are:\n\n1. **Pre-processing**: Clean and
232
+ tokenize user input to prepare for\n processing.\n2. **LLM Generation**:
233
+ Generate an initial draft response using an LLM\n like [**GPT-3**](https://openai.com/product)
234
+ or [**Jurassic-1\n Jumbo**](https://www.livescience.com/google-sentient-ai-lamda-lemoine).\n3. **Retrieval**:
235
+ Employ RAG techniques to retrieve relevant passages\n from a text corpus
236
+ or Knowledge Graphs.\n4. **Fusion**: Integrate retrieved information into
237
+ the LLM-generated\n draft, creating a more informed and factually-grounded
238
+ response.\n5. **Post-processing**: Refine the final response for fluency,\n grammatical
239
+ correctness, and overall coherence.\n\n**Option 2: Knowledge Graphs-Centric
240
+ Pipeline:**\n\nIn this approach, knowledge graphs take center stage. In essence,
241
+ this\npipeline prioritizes the structured knowledge within knowledge graphs,\nusing
242
+ RAG retrieval techniques to translate those insights into\ncompelling and
243
+ informative language. Here''s how it\u00a0unfolds:\n\n1. User input: The
244
+ process begins with the user''s question or statement\n2. Graph exploration:
245
+ The knowledge graph is meticulously explored to\n identify relevant entities,
246
+ relationships, and paths that align with\n the user''s input. This stage
247
+ involves techniques like entity\n linking, path mining, and reasoning to
248
+ uncover valuable information\n within the\u00a0graph\n3. Response planning:
249
+ The insights extracted from the graph are used to\n create a structured
250
+ response plan. This plan outlines the key\n points, facts, and logical
251
+ flow that the final response\n should\u00a0embody\n4. Language generation:
252
+ This is where RAG steps in. Its purpose is to\n create human-like text
253
+ that follows the response plan. It uses LLMs\n to produce well-written
254
+ sentences and paragraphs, combining the\n relevant information from the
255
+ knowledge graph while maintaining\n cohesiveness and readability.\n5. Post-processing:
256
+ The generated response undergoes a final refinement\n process to ensure
257
+ grammatical correctness, clarity, and\n overall\u00a0quality\n\n<figure>\n<img\nsrc=\"https://cdn-images-1.medium.com/max/1024/0*mZ83esKBjbPmCq_C\"
258
+ />\n<figcaption>RAG Knowledge Graph-centric generic\u00a0scheme.</figcaption>\n</figure>\n\nThe
259
+ basic steps\u00a0are:\n\n1. **Query Formulation**: Transform the user input
260
+ into a query\n suitable for Knowledge Graph''s exploration.\n2. **Knowledge
261
+ Graphs:** You can use either Neo4j or\n [NebulaGraph](https://www.nebula-graph.io/)
262
+ to implement a retrieval\n enhancement technique. This technique involves
263
+ utilizing a knowledge\n graph to illustrate the connections between entities
264
+ and\n relationships. Additionally, it incorporates a powerful language\n model
265
+ to improve the retrieval process.\n3. **Fact Selection**: Employ entity linking
266
+ and reasoning algorithms\n to select and prioritize the most relevant facts
267
+ based on the query\n and\u00a0context.\n4. **Natural Language Generation**
268
+ (**NLG**): Utilise specialized NLG\n models like\n [BART](https://research.facebook.com/publications/controllable-abstractive-summarization/)\n to
269
+ translate the extracted facts into a natural language response.\n5. **Refinement**:
270
+ Enhance the generated response for clarity and\n coherence.\n\n### **Unveiling
271
+ a Future of Intelligent Interaction**\n\nThe combination of RAG and Knowledge
272
+ Graphs goes beyond just being a\ntechnological fusion. It paves the way for
273
+ a future where the\ninteraction between humans and computers goes beyond simple
274
+ words and\nbecomes a more informed and refined form of communication. As these\ntechnologies
275
+ continue to develop, we can expect to witness a significant\ntransformation
276
+ in:\n\n- AI-powered assistants that answer your questions with the confidence\n of
277
+ a well-read friend, seamlessly combining relevant facts and\n insights gleaned
278
+ from Knowledge Graphs.\n- Next-generation search engines that go beyond keyword
279
+ matching,\n understanding the deeper meaning behind your queries and delivering\n comprehensive,
280
+ contextual results enriched with information from\n Knowledge Graphs.\n-
281
+ Creative writing tools that utilize RAG and Knowledge Graphs to\n generate
282
+ stories that are both factually accurate and full of\n unexpected plot twists
283
+ and character development, moving beyond\n clich\u00e9d patterns.\n\n###
284
+ **Conclusion**\n\nThe convergence of Retrieval Augmented Generation (RAG)
285
+ and Knowledge\nGraphs (KGs) brings about an exciting synergy in the world
286
+ of Natural\nLanguage Processing (NLP). RAG enhances the output of large language\nmodels
287
+ by carefully selecting relevant information from external sources\nand KGs,
288
+ allowing for well-informed and detailed responses. KGs, on the\nother hand,
289
+ provide a structured representation of real-world entities\nand their relationships,
290
+ enabling the exploration of hidden insights and\nthe discovery of complex
291
+ connections.\n\nThe integration of RAG and KGs opens up two pipeline possibilities.
292
+ The\nLLM-centric pipeline prioritizes the language model''s output, which
293
+ is\nthen enriched with information retrieved from KGs. The Knowledge\nGraphs-centric
294
+ pipeline, on the other hand, places KGs at the center,\nutilizing RAG techniques
295
+ to translate the structured insights into\ncompelling and informative language.\n\nWhile
296
+ integrating LLMs and a knowledge graph for content retrieval\nrequires careful
297
+ planning, the reward is significant. You can gain\naccess to hidden relationships
298
+ within information, ultimately leading to\nhigher-quality output information.\n\nTools
299
+ like **OpenAI**, **Langchain**, and **LlamaIndex** provide\nready-made pipelines
300
+ to integrate knowledge graphs (like **Neo4j**)\neasily. Meanwhile, open-source
301
+ LLMs like **Mistral**, **Llama**, and\n**Dolphin** are catching up to proprietary
302
+ models in performance, making\nthem attractive choices for building custom
303
+ architectures. This\nopen-source scenario allows for the exploration and examination
304
+ of\nvarious methods before fully committing to a particular technological\nframework.
305
+ So, it is crucial to evaluate your needs and choose the\napproach that best
306
+ fits your use\u00a0case.\n\n![](https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=fc0a6900f7eb){width=\"1\"\nheight=\"1\"}\n","doi":"https://doi.org/10.59350/jhrs4-22440","guid":"https://medium.com/p/fc0a6900f7eb","id":"05f01f68-ef81-47d7-a3c1-40aba91d358f","image":"https://cdn-images-1.medium.com/max/1024/1*bJ3eWZ7301vYDzBomwdLfQ.png","indexed_at":1706690571,"language":"en","published_at":1705557796,"reference":[],"relationships":[],"summary":"<strong>\n
307
+ Tools and Platform for Integration of Knowledge Graph with RAG pipelines.\n</strong>\nAuthors:
308
+ Aland Astudillo, Aishwarya Nambissan Many users of chatbots such as ChatGPT,
309
+ have encountered the problem of receiving inappropriate or incompatible responses.
310
+ There are several reasons why this might\u00a0happen. One reason is the lack
311
+ of appropriate training data, as chatbots are usually trained on large amounts
312
+ of text and code.","tags":["Artificial-intelligence","Machine-learning","Retrieval-augmented","Knowledge-graph"],"title":"Unveiling
313
+ the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs","updated_at":1705557796,"url":"https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb"}
314
+
315
+ '
316
+ recorded_at: Wed, 31 Jan 2024 19:50:01 GMT
317
+ recorded_with: VCR 6.2.0
@@ -90,7 +90,7 @@ describe Commonmeta::Metadata, vcr: true do
90
90
  "affiliation" => [{ "name" => "Тверская государственная сельскохозяйственная академия" }], "familyName" => "Ганичева", "givenName" => "А.В.", "type" => "Person", "contributorRoles" => ["Author"],
91
91
  )
92
92
  expect(subject.titles.last).to eq("title" => "MODEL OF SYSTEM DYNAMICS OF PROCESS OF TRAINING",
93
- "titleType" => "TranslatedTitle")
93
+ "type" => "TranslatedTitle")
94
94
  expect(subject.date).to eq("created" => "2019-02-12", "published" => "2019",
95
95
  "registered" => "2019-02-12", "updated" => "2022-08-23")
96
96
  expect(subject.publisher).to eq("name" => "МОДЕЛИРОВАНИЕ, ОПТИМИЗАЦИЯ И ИНФОРМАЦИОННЫЕ ТЕХНОЛОГИИ")
@@ -114,10 +114,14 @@ describe Commonmeta::Metadata, vcr: true do
114
114
  expect(subject.contributors.first).to eq(
115
115
  "name" => "Europäische Kommission", "contributorRoles" => ["Author"], "type" => "Organization",
116
116
  )
117
- expect(subject.titles).to eq([
118
- { "lang" => "de",
119
- "title" => "Flash Eurobarometer 54 (Madrid Summit)" }, { "lang" => "en", "title" => "Flash Eurobarometer 54 (Madrid Summit)" }, { "titleType" => "Subtitle", "lang" => "de", "title" => "The Common European Currency" }, { "titleType" => "Subtitle", "lang" => "en", "title" => "The Common European Currency" },
120
- ])
117
+ expect(subject.titles).to eq([{ "language" => "de", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
118
+ { "language" => "en", "title" => "Flash Eurobarometer 54 (Madrid Summit)" },
119
+ { "language" => "de",
120
+ "title" => "The Common European Currency",
121
+ "type" => "Subtitle" },
122
+ { "language" => "en",
123
+ "title" => "The Common European Currency",
124
+ "type" => "Subtitle" }])
121
125
  expect(subject.subjects).to eq([{ "lang" => "en",
122
126
  "subject" => "KAT12 International Institutions, Relations, Conditions",
123
127
  "subjectScheme" => "ZA" },
@@ -163,14 +167,39 @@ describe Commonmeta::Metadata, vcr: true do
163
167
  expect(subject.contributors.length).to eq(23)
164
168
  expect(subject.contributors[0]).to eq("contributorRoles" => ["Author"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
165
169
  expect(subject.contributors[2]).to eq("contributorRoles" => ["ContactPerson"], "familyName" => "ExampleFamilyName", "givenName" => "ExampleGivenName", "type" => "Person")
166
- expect(subject.date).to eq("created"=>"2022-10-27", "published"=>"2022", "registered"=>"2022-10-27", "updated"=>"2024-01-02")
170
+ expect(subject.date).to eq("created" => "2022-10-27", "published" => "2022", "registered" => "2022-10-27", "updated" => "2024-01-02")
167
171
  expect(subject.publisher).to eq("name" => "Example Publisher")
168
- expect(subject.license).to eq("id"=>"CC-PDDC", "url"=>"https://creativecommons.org/licenses/publicdomain/")
172
+ expect(subject.titles).to eq([{ "language" => "en", "title" => "Example Title" },
173
+ { "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
174
+ { "language" => "fr",
175
+ "title" => "Example TranslatedTitle",
176
+ "type" => "TranslatedTitle" },
177
+ { "language" => "en",
178
+ "title" => "Example AlternativeTitle",
179
+ "type" => "AlternativeTitle" }])
180
+ expect(subject.descriptions).to eq([{ "description" => "Example Abstract",
181
+ "type" => "Abstract",
182
+ "language" => "en" },
183
+ { "description" => "Example Methods",
184
+ "type" => "Methods",
185
+ "language" => "en" },
186
+ { "description" => "Example SeriesInformation",
187
+ "type" => "Other",
188
+ "language" => "en" },
189
+ { "description" => "Example TableOfContents",
190
+ "type" => "Other",
191
+ "language" => "en" },
192
+ { "description" => "Example TechnicalInfo",
193
+ "type" => "TechnicalInfo",
194
+ "language" => "en" },
195
+ { "description" => "Example Other", "type" => "Other", "language" => "en" }])
196
+ expect(subject.license).to eq("id" => "CC-PDDC", "url" => "https://creativecommons.org/licenses/publicdomain/")
169
197
  end
170
198
 
171
199
  it "instrument" do
172
200
  input = "#{fixture_path}datacite-instrument.json"
173
201
  subject = described_class.new(input: input)
202
+ puts subject.errors unless subject.valid?
174
203
  expect(subject.valid?).to be true
175
204
  expect(subject.id).to eq("https://doi.org/10.82433/08qf-ee96")
176
205
  expect(subject.type).to eq("Instrument")
@@ -189,6 +189,31 @@ describe Commonmeta::Metadata, vcr: true do
189
189
  expect(subject.references).to be_nil
190
190
  end
191
191
 
192
+ it "medium post with institutional author" do
193
+ input = "https://api.rogue-scholar.org/posts/05f01f68-ef81-47d7-a3c1-40aba91d358f"
194
+ subject = described_class.new(input: input)
195
+ # expect(subject.valid?).to be true
196
+ expect(subject.id).to eq("https://doi.org/10.59350/jhrs4-22440")
197
+ expect(subject.url).to eq("https://medium.com/@researchgraph/unveiling-the-synergy-retrieval-augmented-generation-rag-meets-knowledge-graphs-fc0a6900f7eb")
198
+ expect(subject.alternate_identifiers).to eq([{ "alternateIdentifier" => "05f01f68-ef81-47d7-a3c1-40aba91d358f", "alternateIdentifierType" => "UUID" }])
199
+ expect(subject.type).to eq("Article")
200
+ expect(subject.contributors.length).to eq(1)
201
+ expect(subject.contributors.first).to eq("contributorRoles"=>["Author"], "name"=>"Research Graph", "type"=>"Organization")
202
+ expect(subject.titles).to eq([{ "title" => "Unveiling the Synergy: Retrieval Augmented Generation (RAG) Meets Knowledge Graphs" }])
203
+ expect(subject.license).to eq("id" => "CC-BY-4.0",
204
+ "url" => "https://creativecommons.org/licenses/by/4.0/legalcode")
205
+ expect(subject.date).to eq("published"=>"2024-01-18", "updated"=>"2024-01-18")
206
+ expect(subject.descriptions.first["description"]).to start_with("<strong> Tools and Platform for Integration of Knowledge Graph with RAG pipelines.")
207
+ expect(subject.publisher).to eq("name" => "Research Graph")
208
+ expect(subject.subjects).to eq([{ "subject" => "Computer and information sciences" },
209
+ { "schemeUri" => "http://www.oecd.org/science/inno/38235147.pdf",
210
+ "subject" => "FOS: Computer and information sciences",
211
+ "subjectScheme" => "Fields of Science and Technology (FOS)" }])
212
+ expect(subject.language).to eq("en")
213
+ expect(subject.container).to eq("identifier" => "https://medium.com/@researchgraph", "identifierType" => "URL", "title" => "Research Graph", "type" => "Periodical")
214
+ expect(subject.references).to be_nil
215
+ end
216
+
192
217
  it "syldavia gazette post with references" do
193
218
  input = "https://api.rogue-scholar.org/posts/0022b9ef-525a-4a79-81ad-13411697f58a"
194
219
  subject = described_class.new(input: input)
@@ -33,11 +33,38 @@ describe Commonmeta::Metadata, vcr: true do
33
33
  "volume" => "426",
34
34
  "firstPage" => "181",
35
35
  "containerTitle" => "Nature")
36
- expect(json["date"]).to eq("published"=>"2014-02-11", "updated"=>"2022-03-26")
36
+ expect(json["date"]).to eq("published" => "2014-02-11", "updated" => "2022-03-26")
37
37
  expect(json["descriptions"].first["description"]).to start_with("Among various advantages,")
38
- expect(json["license"]).to eq("id"=>"CC-BY-3.0", "url"=>"https://creativecommons.org/licenses/by/3.0/legalcode")
38
+ expect(json["license"]).to eq("id" => "CC-BY-3.0", "url" => "https://creativecommons.org/licenses/by/3.0/legalcode")
39
39
  expect(json["provider"]).to eq("Crossref")
40
- expect(json["files"].first).to eq("mimeType"=>"application/pdf", "url"=>"https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
40
+ expect(json["files"].first).to eq("mimeType" => "application/pdf", "url" => "https://cdn.elifesciences.org/articles/01567/elife-01567-v1.pdf")
41
+ end
42
+
43
+ it "dataset schema v4.5" do
44
+ input = "#{fixture_path}datacite-dataset_v4.5.json"
45
+ subject = described_class.new(input: input)
46
+ expect(subject.id).to eq("https://doi.org/10.82433/b09z-4k37")
47
+ json = JSON.parse(subject.commonmeta)
48
+ expect(json["id"]).to eq("https://doi.org/10.82433/b09z-4k37")
49
+ expect(json["type"]).to eq("Dataset")
50
+ expect(json["titles"]).to eq([{ "language" => "en", "title" => "Example Title" },
51
+ { "language" => "en", "title" => "Example Subtitle", "type" => "Subtitle" },
52
+ { "language" => "fr",
53
+ "title" => "Example TranslatedTitle",
54
+ "type" => "TranslatedTitle" },
55
+ { "language" => "en",
56
+ "title" => "Example AlternativeTitle",
57
+ "type" => "AlternativeTitle" }])
58
+ expect(json["descriptions"]).to eq([{ "description" => "Example Abstract", "language" => "en", "type" => "Abstract" },
59
+ { "description" => "Example Methods", "language" => "en", "type" => "Methods" },
60
+ { "description" => "Example SeriesInformation",
61
+ "language" => "en",
62
+ "type" => "Other" },
63
+ { "description" => "Example TableOfContents", "language" => "en", "type" => "Other" },
64
+ { "description" => "Example TechnicalInfo",
65
+ "language" => "en",
66
+ "type" => "TechnicalInfo" },
67
+ { "description" => "Example Other", "language" => "en", "type" => "Other" }])
41
68
  end
42
69
  end
43
70
  end
@@ -7,6 +7,7 @@ describe Commonmeta::Metadata, vcr: true do
7
7
  it 'Dataset' do
8
8
  input = 'https://doi.org/10.5061/DRYAD.8515'
9
9
  subject = described_class.new(input: input, from: 'datacite')
10
+ puts subject.errors unless subject.valid?
10
11
  expect(subject.valid?).to be true
11
12
  json = JSON.parse(subject.csl)
12
13
  expect(json['type']).to eq('dataset')
@@ -37,6 +37,7 @@ describe Commonmeta::Metadata, vcr: true do
37
37
  it 'text' do
38
38
  input = 'https://doi.org/10.3204/desy-2014-01645'
39
39
  subject = described_class.new(input: input, from: 'datacite')
40
+ puts subject.errors unless subject.valid?
40
41
  expect(subject.valid?).to be true
41
42
  csv = subject.csv.parse_csv
42
43
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: commonmeta-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.11.0
4
+ version: 3.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Fenner
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-01-26 00:00:00.000000000 Z
11
+ date: 2024-01-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -694,7 +694,7 @@ files:
694
694
  - lib/commonmeta/xml_converter.rb
695
695
  - resources/2008/09/xsd.xsl
696
696
  - resources/cff.json
697
- - resources/commonmeta_v0.10.6.json
697
+ - resources/commonmeta_v0.10.7.json
698
698
  - resources/crossref/AccessIndicators.xsd
699
699
  - resources/crossref/JATS-journalpublishing1-3d2-mathml3-elements.xsd
700
700
  - resources/crossref/JATS-journalpublishing1-3d2-mathml3.xsd
@@ -921,6 +921,7 @@ files:
921
921
  - spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/ghost_post_without_doi.yml
922
922
  - spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/jekyll_post.yml
923
923
  - spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/jekyll_post_with_anonymous_author.yml
924
+ - spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/medium_post_with_institutional_author.yml
924
925
  - spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/substack_post_with_broken_reference.yml
925
926
  - spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/syldavia_gazette_post_with_references.yml
926
927
  - spec/fixtures/vcr_cassettes/Commonmeta_Metadata/get_json_feed_item_metadata/upstream_post_with_references.yml