wikidata-diff-analyzer 0.1.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,71 @@
1
# Compares the "representations" map stored under a piece of revision content
# (keyed by language code) between a revision and its parent.
class RepresentationAnalyzer
  # Reports which languages gained, lost or altered a representation.
  #
  # current_content - Hash for the current revision, or nil when it has none.
  # parent_content  - Hash for the parent revision, or nil when the current
  #                   revision has no parent (first revision).
  #
  # Returns a Hash with the keys :changed, :removed and :added, each holding
  # an Array of `{ lang: <language key> }` entries.
  def self.isolate_representation_differences(current_content, parent_content)
    current_representations = extract_representations(current_content)
    parent_representations = extract_representations(parent_content)

    # Without a parent revision there is nothing to compare against, so every
    # language present in the current revision counts as added.
    if parent_content.nil?
      added = current_representations.keys.map { |lang| { lang: lang } }
      return { changed: [], removed: [], added: added }
    end

    changed = []
    added = []
    current_representations.each do |lang, current_representation|
      parent_representation = parent_representations[lang]

      if parent_representation.nil?
        added << { lang: lang }
      elsif current_representation != parent_representation
        changed << { lang: lang }
      end
    end

    # Anything the parent had that the current revision no longer has.
    removed = parent_representations.keys
                                    .select { |lang| current_representations[lang].nil? }
                                    .map { |lang| { lang: lang } }

    { changed: changed, removed: removed, added: added }
  end

  # Pulls the 'representations' map out of a content hash.
  #
  # Anything that is not a Hash is treated as "no representations": nil
  # content, a missing key, the `[]` that Wikibase JSON can emit for an empty
  # map, or any other unexpected value (which previously crashed the
  # comparison loops).
  def self.extract_representations(content)
    representations = content['representations'] if content
    representations.is_a?(Hash) ? representations : {}
  end
  private_class_method :extract_representations
end
@@ -3,48 +3,164 @@ require_relative 'alias_analyzer'
3
3
  require_relative 'label_analyzer'
4
4
  require_relative 'description_analyzer'
5
5
  require_relative 'sitelink_analyzer'
6
+ require_relative 'comment_analyzer'
7
+ require_relative 'form_analyzer'
8
+ require_relative 'sense_analyzer'
9
+ require_relative 'lemma_analyzer'
10
+
6
11
 
7
12
# Turns one revision (current content, parent content and edit comment) into a
# flat Hash of change counters. Every supported entity model fills the same
# set of keys; counters that cannot apply to a model are set to 0.
class RevisionAnalyzer
  CLAIM_TYPES = %i[
    added_claims removed_claims changed_claims
    added_references removed_references changed_references
    added_qualifiers removed_qualifiers changed_qualifiers
  ].freeze
  ALIAS_TYPES = %i[added_aliases removed_aliases changed_aliases].freeze
  LABEL_TYPES = %i[added_labels removed_labels changed_labels].freeze
  DESCRIPTION_TYPES = %i[added_descriptions removed_descriptions changed_descriptions].freeze
  SITELINK_TYPES = %i[added_sitelinks removed_sitelinks changed_sitelinks].freeze
  COMMENT_TYPES = %i[merge_to merge_from redirect undo restore clear_item].freeze
  LEMMA_TYPES = %i[added_lemmas removed_lemmas changed_lemmas].freeze
  FORM_TYPES = %i[
    added_forms removed_forms changed_forms
    added_representations removed_representations changed_representations
    added_formclaims removed_formclaims changed_formclaims
  ].freeze
  SENSE_TYPES = %i[
    added_senses removed_senses changed_senses
    added_glosses removed_glosses changed_glosses
    added_senseclaims removed_senseclaims changed_senseclaims
  ].freeze

  # Counters that only lexemes can produce. Shared by NOT_IN_ITEM and
  # NOT_IN_PROPERTY so the two lists cannot drift apart.
  LEXEME_ONLY_TYPES = %i[
    added_lemmas removed_lemmas changed_lemmas
    added_forms removed_forms changed_forms
    added_senses removed_senses changed_senses
    added_representations removed_representations changed_representations
    added_glosses removed_glosses changed_glosses
    added_formclaims removed_formclaims changed_formclaims
    added_senseclaims removed_senseclaims changed_senseclaims
  ].freeze

  # Counters forced to 0 for each model.
  NOT_IN_ITEM = (%i[create_lexeme create_property] + LEXEME_ONLY_TYPES).freeze
  NOT_IN_PROPERTY = (
    %i[create_lexeme create_item added_sitelinks removed_sitelinks changed_sitelinks] + LEXEME_ONLY_TYPES
  ).freeze
  NOT_IN_LEXEME = %i[
    create_item create_property
    added_sitelinks changed_sitelinks removed_sitelinks
    added_aliases changed_aliases removed_aliases
    added_labels changed_labels removed_labels
    added_descriptions changed_descriptions removed_descriptions
  ].freeze

  # Computes the change counters for one revision.
  #
  # revision_data - Hash read with the keys :model, :current_content,
  #                 :parent_content and :comment.
  #
  # Returns the counters Hash. It is empty when :model is not one of
  # 'wikibase-item', 'wikibase-property' or 'wikibase-lexeme'.
  def self.analyze_diff(revision_data)
    diff = {}
    case revision_data[:model]
    when 'wikibase-item' then item(diff, revision_data)
    when 'wikibase-property' then property(diff, revision_data)
    when 'wikibase-lexeme' then lexeme(diff, revision_data)
    end
    diff
  end

  # Fills +diff+ with the counters of an item revision and returns it.
  def self.item(diff, revision_data)
    current_content = revision_data[:current_content]
    parent_content = revision_data[:parent_content]

    # Claim differences include references and qualifiers.
    record_counts(diff, CLAIM_TYPES,
                  ClaimAnalyzer.isolate_claims_differences(current_content, parent_content))
    record_counts(diff, ALIAS_TYPES,
                  AliasAnalyzer.isolate_aliases_differences(current_content, parent_content))
    record_counts(diff, LABEL_TYPES,
                  LabelAnalyzer.isolate_labels_differences(current_content, parent_content))
    record_counts(diff, DESCRIPTION_TYPES,
                  DescriptionAnalyzer.isolate_descriptions_differences(current_content, parent_content))
    record_counts(diff, SITELINK_TYPES,
                  SitelinkAnalyzer.isolate_sitelinks_differences(current_content, parent_content))

    phrases = CommentAnalyzer.isolate_comment_differences(revision_data[:comment])
    record_comment_flags(diff, phrases)

    zero_out(diff, NOT_IN_ITEM)
    diff[:create_item] = phrases[:create_item]

    diff
  end

  # Fills +diff+ with the counters of a property revision and returns it.
  # Properties have no sitelinks, so those counters are zeroed.
  def self.property(diff, revision_data)
    current_content = revision_data[:current_content]
    parent_content = revision_data[:parent_content]

    # Claim differences include references and qualifiers.
    record_counts(diff, CLAIM_TYPES,
                  ClaimAnalyzer.isolate_claims_differences(current_content, parent_content))
    record_counts(diff, ALIAS_TYPES,
                  AliasAnalyzer.isolate_aliases_differences(current_content, parent_content))
    record_counts(diff, LABEL_TYPES,
                  LabelAnalyzer.isolate_labels_differences(current_content, parent_content))
    record_counts(diff, DESCRIPTION_TYPES,
                  DescriptionAnalyzer.isolate_descriptions_differences(current_content, parent_content))

    phrases = CommentAnalyzer.isolate_comment_differences(revision_data[:comment])
    record_comment_flags(diff, phrases)

    # The comment analyzer reports entity creation under :create_item
    # whatever the model; it is stored under the model-specific key here.
    diff[:create_property] = phrases[:create_item]
    zero_out(diff, NOT_IN_PROPERTY)

    diff
  end

  # Fills +diff+ with the counters of a lexeme revision and returns it.
  # Lexemes have no labels, descriptions, aliases or sitelinks, so those
  # counters are zeroed.
  def self.lexeme(diff, revision_data)
    current_content = revision_data[:current_content]
    parent_content = revision_data[:parent_content]

    # Claim differences include references and qualifiers.
    record_counts(diff, CLAIM_TYPES,
                  ClaimAnalyzer.isolate_claims_differences(current_content, parent_content))
    # Form differences include representations and form claims.
    record_counts(diff, FORM_TYPES,
                  FormAnalyzer.isolate_forms_differences(current_content, parent_content))
    record_counts(diff, LEMMA_TYPES,
                  LemmaAnalyzer.isolate_lemmas_differences(current_content, parent_content))
    # Sense differences include glosses and sense claims.
    record_counts(diff, SENSE_TYPES,
                  SenseAnalyzer.isolate_senses_differences(current_content, parent_content))

    phrases = CommentAnalyzer.isolate_comment_differences(revision_data[:comment])
    record_comment_flags(diff, phrases)

    zero_out(diff, NOT_IN_LEXEME)
    # The comment analyzer reports entity creation under :create_item
    # whatever the model; it is stored under the model-specific key here.
    diff[:create_lexeme] = phrases[:create_item]

    diff
  end

  # Stores, for each change type, how many entries the analyzer reported.
  def self.record_counts(diff, change_types, differences)
    change_types.each do |change_type|
      diff[change_type] = differences[change_type].length
    end
  end

  # Copies the comment-derived values through unchanged.
  def self.record_comment_flags(diff, phrases)
    COMMENT_TYPES.each do |change_type|
      diff[change_type] = phrases[change_type]
    end
  end

  # Sets every listed counter to 0 so all models expose the same keys.
  def self.zero_out(diff, change_types)
    change_types.each do |change_type|
      diff[change_type] = 0
    end
  end

  private_class_method :record_counts, :record_comment_flags, :zero_out
end
@@ -0,0 +1,106 @@
1
+ require_relative 'gloss_analyzer'
2
+ require_relative 'inside_claim_analyzer'
3
# Compares the "senses" list of a lexeme between a revision and its parent,
# together with the glosses and claims nested inside each sense.
class SenseAnalyzer
  # Reports added, removed and changed senses plus their nested gloss and
  # claim differences.
  #
  # Senses are matched by their position in the list, so each entry of the
  # *_senses arrays has the form `{ index: <position> }`.
  #
  # current_content - Hash for the current revision, or nil.
  # parent_content  - Hash for the parent revision, or nil when the current
  #                   revision has no parent (first revision).
  #
  # Returns a Hash with the keys :added_senses, :removed_senses,
  # :changed_senses, :added_glosses, :removed_glosses, :changed_glosses,
  # :added_senseclaims, :removed_senseclaims and :changed_senseclaims, each
  # holding an Array.
  def self.isolate_senses_differences(current_content, parent_content)
    result = {
      added_senses: [],
      removed_senses: [],
      changed_senses: [],
      added_glosses: [],
      removed_glosses: [],
      changed_glosses: [],
      added_senseclaims: [],
      removed_senseclaims: [],
      changed_senseclaims: []
    }

    # Each side is normalised on its own. A missing parent (first revision)
    # or a missing current revision then simply compares against an empty
    # list, instead of short-circuiting to an empty result as before.
    current_senses = extract_senses(current_content)
    parent_senses = extract_senses(parent_content)

    current_senses.each_with_index do |current_sense, index|
      parent_sense = parent_senses[index]

      if parent_sense.nil?
        result[:added_senses] << { index: index }
      elsif current_sense != parent_sense
        result[:changed_senses] << { index: index }
      else
        next
      end

      # parent_sense is nil for an added sense, so everything nested in it is
      # compared against nothing; for a changed sense the real parent is used
      # so only genuine gloss/claim changes are reported.
      collect_nested_differences(result, current_sense, parent_sense)
    end

    parent_senses.each_with_index do |parent_sense, index|
      next unless current_senses[index].nil?

      result[:removed_senses] << { index: index }
      # NOTE(review): relies on InsideClaimAnalyzer accepting a nil current
      # side the same way GlossAnalyzer already had to - confirm.
      collect_nested_differences(result, nil, parent_sense)
    end

    result
  end

  # Pulls the 'senses' list out of a content hash; anything that is not an
  # Array (nil content, missing key, unexpected value) counts as no senses.
  def self.extract_senses(content)
    senses = content['senses'] if content
    senses.is_a?(Array) ? senses : []
  end

  # Appends the gloss and claim differences of one sense pair to +result+.
  def self.collect_nested_differences(result, current_sense, parent_sense)
    glosses = GlossAnalyzer.isolate_gloss_differences(current_sense, parent_sense)
    result[:added_glosses].concat(glosses[:added])
    result[:removed_glosses].concat(glosses[:removed])
    result[:changed_glosses].concat(glosses[:changed])

    senseclaims = InsideClaimAnalyzer.isolate_inside_claim_differences(current_sense, parent_sense)
    result[:added_senseclaims].concat(senseclaims[:added])
    result[:removed_senseclaims].concat(senseclaims[:removed])
    result[:changed_senseclaims].concat(senseclaims[:changed])
  end

  private_class_method :extract_senses, :collect_nested_differences
end
@@ -47,15 +47,11 @@ class SitelinkAnalyzer
47
47
  # All sitelinks are removed if current content is nil
48
48
  removed_sitelinks = parent_content['sitelinks']
49
49
  end
50
-
51
- # puts "Added sitelinks: #{added_sitelinks}"
52
- # puts "Removed sitelinks: #{removed_sitelinks}"
53
- # puts "Changed sitelinks: #{changed_sitelinks}"
54
50
 
55
51
  {
56
- added: added_sitelinks,
57
- removed: removed_sitelinks,
58
- changed: changed_sitelinks
52
+ added_sitelinks: added_sitelinks,
53
+ removed_sitelinks: removed_sitelinks,
54
+ changed_sitelinks: changed_sitelinks
59
55
  }
60
56
  end
61
57
  end
@@ -21,5 +21,36 @@ class Total
21
21
  total[:sitelinks_added] += diff_data[:added_sitelinks]
22
22
  total[:sitelinks_removed] += diff_data[:removed_sitelinks]
23
23
  total[:sitelinks_changed] += diff_data[:changed_sitelinks]
24
+ total[:lemmas_added] += diff_data[:added_lemmas]
25
+ total[:lemmas_removed] += diff_data[:removed_lemmas]
26
+ total[:lemmas_changed] += diff_data[:changed_lemmas]
27
+ total[:forms_added] += diff_data[:added_forms]
28
+ total[:forms_removed] += diff_data[:removed_forms]
29
+ total[:forms_changed] += diff_data[:changed_forms]
30
+ total[:representations_added] += diff_data[:added_representations]
31
+ total[:representations_removed] += diff_data[:removed_representations]
32
+ total[:representations_changed] += diff_data[:changed_representations]
33
+ total[:formclaims_added] += diff_data[:added_formclaims]
34
+ total[:formclaims_removed] += diff_data[:removed_formclaims]
35
+ total[:formclaims_changed] += diff_data[:changed_formclaims]
36
+ total[:senses_added] += diff_data[:added_senses]
37
+ total[:senses_removed] += diff_data[:removed_senses]
38
+ total[:senses_changed] += diff_data[:changed_senses]
39
+ total[:glosses_added] += diff_data[:added_glosses]
40
+ total[:glosses_removed] += diff_data[:removed_glosses]
41
+ total[:glosses_changed] += diff_data[:changed_glosses]
42
+ total[:senseclaims_added] += diff_data[:added_senseclaims]
43
+ total[:senseclaims_removed] += diff_data[:removed_senseclaims]
44
+ total[:senseclaims_changed] += diff_data[:changed_senseclaims]
45
+ total[:merge_from] += diff_data[:merge_from]
46
+ total[:merge_to] += diff_data[:merge_to]
47
+ total[:undo] += diff_data[:undo]
48
+ total[:restore] += diff_data[:restore]
49
+ total[:clear_item] += diff_data[:clear_item]
50
+ total[:create_item] += diff_data[:create_item]
51
+ total[:create_property] += diff_data[:create_property]
52
+ total[:create_lexeme] += diff_data[:create_lexeme]
53
+ total[:redirect] += diff_data[:redirect]
54
+
24
55
  end
25
56
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wikidata-diff-analyzer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sulagna Saha
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-07-05 00:00:00.000000000 Z
11
+ date: 2023-07-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json
@@ -34,6 +34,7 @@ extra_rdoc_files: []
34
34
  files:
35
35
  - CHANGELOG.md
36
36
  - CODE_OF_CONDUCT.md
37
+ - CONTRIBUTING.md
37
38
  - Gemfile
38
39
  - Gemfile.lock
39
40
  - LICENSE
@@ -44,11 +45,19 @@ files:
44
45
  - lib/wikidata/diff/analyzer/version.rb
45
46
  - lib/wikidata/diff/api.rb
46
47
  - lib/wikidata/diff/claim_analyzer.rb
48
+ - lib/wikidata/diff/comment_analyzer.rb
47
49
  - lib/wikidata/diff/description_analyzer.rb
50
+ - lib/wikidata/diff/form_analyzer.rb
51
+ - lib/wikidata/diff/gloss_analyzer.rb
52
+ - lib/wikidata/diff/inside_claim_analyzer.rb
48
53
  - lib/wikidata/diff/label_analyzer.rb
49
54
  - lib/wikidata/diff/large_batches_analyzer.rb
50
- - lib/wikidata/diff/mediawiki_login.rb
55
+ - lib/wikidata/diff/lemma_analyzer.rb
56
+ - lib/wikidata/diff/qualifier_analyzer.rb
57
+ - lib/wikidata/diff/reference_analyzer.rb
58
+ - lib/wikidata/diff/representation_analyzer.rb
51
59
  - lib/wikidata/diff/revision_analyzer.rb
60
+ - lib/wikidata/diff/sense_analyzer.rb
52
61
  - lib/wikidata/diff/sitelink_analyzer.rb
53
62
  - lib/wikidata/diff/total.rb
54
63
  - sig/wikidata/diff/analyzer.rbs
@@ -1,12 +0,0 @@
1
- require 'mediawiki_api'
2
- # to load env variable
3
- require 'dotenv/load'
4
-
5
- # THIS IS NOT WORKING YET
6
- class MediawikiLogin
7
- def self.mediawiki_login
8
- client = MediawikiApi::Client.new('https://www.mediawiki.org/w/api.php')
9
- client.log_in(ENV['MEDIAWIKI_USERNAME'], ENV['MEDIAWIKI_PASSWORD'])
10
- client.logged_in? # Return whether login was successful
11
- end
12
- end