wikidata-diff-analyzer 0.1.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
1
+ require_relative 'representation_analyzer'
2
+ require_relative 'inside_claim_analyzer'
3
# Compares the "forms" arrays of two revision contents position by position
# and aggregates the differences found in each form's representations and
# claims.
class FormAnalyzer
  # Isolates added, removed and changed forms between two revisions.
  #
  # Forms are matched by their index in the "forms" array. For every form the
  # representation and claim differences reported by RepresentationAnalyzer
  # and InsideClaimAnalyzer are accumulated into the result.
  #
  # @param current_content [Hash, nil] content of the revision under analysis
  # @param parent_content [Hash, nil] content of its parent revision
  # @return [Hash{Symbol => Array}] keys :added_forms, :removed_forms,
  #   :changed_forms, :added_representations, :removed_representations,
  #   :changed_representations, :added_formclaims, :removed_formclaims,
  #   :changed_formclaims
  def self.isolate_forms_differences(current_content, parent_content)
    current_forms = current_content&.fetch("forms", []) || []
    parent_forms = parent_content&.fetch("forms", []) || []

    diff = {
      added_forms: [],
      removed_forms: [],
      changed_forms: [],
      added_representations: [],
      removed_representations: [],
      changed_representations: [],
      added_formclaims: [],
      removed_formclaims: [],
      changed_formclaims: []
    }

    current_forms.each_with_index do |current_form, index|
      parent_form = parent_forms[index]

      if parent_form.nil?
        # Form was added
        diff[:added_forms] << { index: index }
      elsif current_form && current_form != parent_form
        # Form was changed: only report it when it actually differs from the
        # parent, instead of flagging every form present in both revisions.
        diff[:changed_forms] << { index: index }
      end

      representations = RepresentationAnalyzer.isolate_representation_differences(current_form, parent_form)
      diff[:added_representations].concat(representations[:added])
      diff[:removed_representations].concat(representations[:removed])
      diff[:changed_representations].concat(representations[:changed])

      formclaims = InsideClaimAnalyzer.isolate_inside_claim_differences(current_form, parent_form)
      diff[:added_formclaims].concat(formclaims[:added])
      diff[:removed_formclaims].concat(formclaims[:removed])
      diff[:changed_formclaims].concat(formclaims[:changed])
    end

    parent_forms.each_with_index do |parent_form, index|
      next unless current_forms[index].nil?

      # Form was removed, so everything it contained was removed with it.
      diff[:removed_forms] << { index: index }

      representations = RepresentationAnalyzer.isolate_representation_differences(nil, parent_form)
      diff[:removed_representations].concat(representations[:removed])

      formclaims = InsideClaimAnalyzer.isolate_inside_claim_differences(nil, parent_form)
      diff[:removed_formclaims].concat(formclaims[:removed])
    end

    diff
  end
end
@@ -0,0 +1,71 @@
1
# Compares the "glosses" of two revision contents language by language.
class GlossAnalyzer
  # Isolates the languages whose gloss was added, removed or changed.
  #
  # A missing revision (nil) or a "glosses" value that is not a Hash (for
  # example an empty Array) is treated as having no glosses at all, so a nil
  # parent reports every current gloss as added.
  #
  # @param current_content [Hash, nil] content of the revision under analysis
  # @param parent_content [Hash, nil] content of its parent revision
  # @return [Hash{Symbol => Array<Hash>}] keys :changed, :removed and :added,
  #   each holding entries of the form { lang: language_key }
  def self.isolate_gloss_differences(current_content, parent_content)
    current_glosses = glosses_of(current_content)
    parent_glosses = glosses_of(parent_content)

    changed = []
    removed = []
    added = []

    current_glosses.each do |lang, current_gloss|
      parent_gloss = parent_glosses[lang]

      if parent_gloss.nil?
        added << { lang: lang }
      elsif current_gloss != parent_gloss
        changed << { lang: lang }
      end
    end

    # Anything the parent had that the current revision lacks was removed.
    parent_glosses.each_key do |lang|
      removed << { lang: lang } if current_glosses[lang].nil?
    end

    {
      changed: changed,
      removed: removed,
      added: added
    }
  end

  # Returns the "glosses" Hash of a revision content, or an empty Hash when
  # the content is nil or the value is not a Hash.
  def self.glosses_of(content)
    glosses = content && content['glosses']
    glosses.is_a?(Hash) ? glosses : {}
  end
  private_class_method :glosses_of
end
@@ -0,0 +1,84 @@
1
# Compares the "claims" nested inside two content hashes (the same routine is
# applied to forms by FormAnalyzer), matching claims by property key and by
# position within that key's list.
class InsideClaimAnalyzer
  # Isolates added, removed and changed claims.
  #
  # A nil content, or a "claims" value that is not a Hash, counts as having
  # no claims. With a nil parent every current claim is therefore reported as
  # added.
  #
  # @param current_content [Hash, nil] content holding the current claims
  # @param parent_content [Hash, nil] content holding the parent claims
  # @return [Hash{Symbol => Array<Hash>}] keys :added, :removed and :changed,
  #   each holding entries of the form { key: claim_key, index: position }
  def self.isolate_inside_claim_differences(current_content, parent_content)
    current_by_key = claims_of(current_content)
    parent_by_key = claims_of(parent_content)

    added = []
    removed = []
    changed = []

    current_by_key.each do |claim_key, current_claims|
      # A key missing from the parent behaves like an empty claim list: every
      # current claim under it is an addition and nothing can be removed.
      parent_claims = parent_by_key.fetch(claim_key, [])

      current_claims.each_with_index do |current_claim, index|
        parent_claim = parent_claims[index]

        if parent_claim.nil?
          added << { key: claim_key, index: index }
        elsif current_claim != parent_claim
          changed << { key: claim_key, index: index }
        end
      end

      # Positions the parent had beyond the current list were removed.
      parent_claims.each_index do |index|
        removed << { key: claim_key, index: index } if current_claims[index].nil?
      end
    end

    # Keys that disappeared entirely: all of their claims were removed.
    parent_by_key.each do |claim_key, parent_claims|
      next if current_by_key.key?(claim_key)

      parent_claims.each_index do |index|
        removed << { key: claim_key, index: index }
      end
    end

    {
      added: added,
      removed: removed,
      changed: changed
    }
  end

  # Returns the "claims" Hash of a content, or an empty Hash when the content
  # is nil or the value is not a Hash.
  def self.claims_of(content)
    claims = content && content["claims"]
    claims.is_a?(Hash) ? claims : {}
  end
  private_class_method :claims_of
end
@@ -1,53 +1,75 @@
1
# Compares the "labels" of two revision contents language by language.
class LabelAnalyzer
  # Isolates the languages whose label was added, removed or changed.
  #
  # A missing revision (nil) or a "labels" value that is not a Hash (for
  # example an empty Array) is treated as having no labels, so a nil parent
  # reports every current label as added.
  #
  # @param current_content [Hash, nil] content of the revision under analysis
  # @param parent_content [Hash, nil] content of its parent revision
  # @return [Hash{Symbol => Array<Hash>}] keys :changed_labels,
  #   :removed_labels and :added_labels, each holding { lang: language_key }
  def self.isolate_labels_differences(current_content, parent_content)
    current_labels = labels_of(current_content)
    parent_labels = labels_of(parent_content)

    changed_labels = []
    removed_labels = []
    added_labels = []

    current_labels.each do |lang, current_label|
      parent_label = parent_labels[lang]

      if parent_label.nil?
        added_labels << { lang: lang }
      elsif current_label != parent_label
        changed_labels << { lang: lang }
      end
    end

    # A label is removed when its language is gone from the current revision.
    # This has to be checked per language: testing only whether the current
    # labels are empty misses removals while other labels remain.
    parent_labels.each_key do |lang|
      removed_labels << { lang: lang } if current_labels[lang].nil?
    end

    {
      changed_labels: changed_labels,
      removed_labels: removed_labels,
      added_labels: added_labels
    }
  end

  # Returns the "labels" Hash of a revision content, or an empty Hash when the
  # content is nil or the value is not a Hash.
  def self.labels_of(content)
    labels = content && content['labels']
    labels.is_a?(Hash) ? labels : {}
  end
  private_class_method :labels_of
end
@@ -5,34 +5,49 @@ class LargeBatchesAnalyzer
5
5
  def self.handle_large_batches(revision_ids, batch_size)
6
6
  revision_contents = {}
7
7
  parent_contents = {}
8
-
9
-
10
- revision_ids_batches = revision_ids.each_slice(batch_size).to_a
11
- revision_ids_batches.each do |batch|
12
- parsed_contents = Api.get_revision_contents(batch)
13
- if parsed_contents
14
- parent_ids = []
15
- revision_contents.merge!(parsed_contents) if parsed_contents
16
- parsed_contents.values.each do |data|
17
- parent_id = data[:parentid]
18
-
19
- if parent_id != 0 && !parent_id.nil?
20
- parent_ids << parent_id
21
- end
22
- end
23
- parent_contents_batch = Api.get_revision_contents(parent_ids)
24
- parent_contents.merge!(parent_contents_batch) if parent_contents_batch
8
+ first_revisions = []
9
+
10
+ revision_ids.each_slice(batch_size) do |batch|
11
+ parent_ids = []
12
+ parsed_contents = Api.get_revision_contents(batch)
13
+ next unless parsed_contents
14
+
15
+ # I have to check if any of the revision ids in the parsed content has parentid == 0
16
+ parsed_contents.each do |revid, data|
17
+ if data[:parentid] == 0
18
+ first_revisions << revid
19
+ else
20
+ parent_ids << data[:parentid]
25
21
  end
22
+ end
23
+ revision_contents.merge!(parsed_contents)
24
+ parent_contents_batch = Api.get_revision_contents(parent_ids)
25
+ parent_contents.merge!(parent_contents_batch) if parent_contents_batch
26
26
  end
27
-
27
+
28
+ build_result(revision_contents, parent_contents, first_revisions)
29
+ end
30
+
31
# Combines fetched revision data with the content of each parent revision.
#
# Every revision in revision_contents gets an entry whose :parent_content is
# looked up through its :parentid. Ids listed in first_revisions are then
# (re)written with a nil :parent_content; an id listed there that has no
# fetched data still gets an entry, with every field nil. A nil data value is
# tolerated and yields an all-nil entry instead of raising.
#
# @param revision_contents [Hash] revision id => data hash (or nil) carrying
#   :parentid, :content, :comment and :model
# @param parent_contents [Hash] parent revision id => data hash with :content
# @param first_revisions [Array] ids of revisions whose :parentid was 0
# @return [Hash] revision id => { current_content:, parent_content:,
#   comment:, model: }
def self.build_result(revision_contents, parent_contents, first_revisions)
  entry_for = lambda do |data, parent|
    {
      current_content: data&.fetch(:content, nil),
      parent_content: parent&.fetch(:content, nil),
      comment: data&.fetch(:comment, nil),
      model: data&.fetch(:model, nil)
    }
  end

  result = {}

  revision_contents.each do |revid, data|
    parent = data ? parent_contents[data[:parentid]] : nil
    result[revid] = entry_for.call(data, parent)
  end

  first_revisions.each do |revid|
    result[revid] = entry_for.call(revision_contents[revid], nil)
  end

  result
end
52
+
38
53
  end
@@ -0,0 +1,70 @@
1
# Compares the "lemmas" of two revision contents language by language.
class LemmaAnalyzer
  # Isolates the languages whose lemma was added, removed or changed.
  #
  # A missing revision (nil) or a "lemmas" value that is not a Hash (for
  # example an empty Array) is treated as having no lemmas, so a nil parent
  # reports every current lemma as added.
  #
  # @param current_content [Hash, nil] content of the revision under analysis
  # @param parent_content [Hash, nil] content of its parent revision
  # @return [Hash{Symbol => Array<Hash>}] keys :changed_lemmas,
  #   :removed_lemmas and :added_lemmas, each holding { lang: language_key }
  def self.isolate_lemmas_differences(current_content, parent_content)
    current_lemmas = lemmas_of(current_content)
    parent_lemmas = lemmas_of(parent_content)

    changed_lemmas = []
    removed_lemmas = []
    added_lemmas = []

    current_lemmas.each do |lang, current_lemma|
      parent_lemma = parent_lemmas[lang]

      if parent_lemma.nil?
        added_lemmas << { lang: lang }
      elsif current_lemma != parent_lemma
        changed_lemmas << { lang: lang }
      end
    end

    # Anything the parent had that the current revision lacks was removed.
    parent_lemmas.each_key do |lang|
      removed_lemmas << { lang: lang } if current_lemmas[lang].nil?
    end

    {
      changed_lemmas: changed_lemmas,
      removed_lemmas: removed_lemmas,
      added_lemmas: added_lemmas
    }
  end

  # Returns the "lemmas" Hash of a revision content, or an empty Hash when the
  # content is nil or the value is not a Hash.
  def self.lemmas_of(content)
    lemmas = content && content['lemmas']
    lemmas.is_a?(Hash) ? lemmas : {}
  end
  private_class_method :lemmas_of
end
@@ -0,0 +1,83 @@
1
# Helpers for locating qualifier differences inside a single claim.
class QualifierAnalyzer
  # Appends one entry per qualifier value of the claim to updated_qualifiers.
  # Used both when a whole claim was added and when it was removed, which is
  # why the target list is supplied by the caller.
  #
  # @param claim [Hash] claim that may carry a "qualifiers" hash
  # @param updated_qualifiers [Array<Hash>] list to append to (mutated)
  # @param claim_key [Object] key of the claim, copied into each entry
  # @param claim_index [Integer] index of the claim, copied into each entry
  # @return [Array<Hash>] the same updated_qualifiers list
  def self.qualifier_updates(claim, updated_qualifiers, claim_key, claim_index)
    (claim["qualifiers"] || {}).each do |qualifier_key, qualifier_values|
      qualifier_values.each_index do |qualifier_index|
        updated_qualifiers << {
          claim_key: claim_key,
          claim_index: claim_index,
          qualifier_key: qualifier_key,
          qualifier_index: qualifier_index
        }
      end
    end
    updated_qualifiers
  end

  # Compares the qualifiers of a claim with those of its parent version,
  # matching values by qualifier key and by position within that key's list,
  # and appends the findings to the three caller-supplied lists.
  #
  # A value with no counterpart at the same position in the parent is an
  # addition, including values appended under a key the parent already had.
  #
  # @param current_claim [Hash] current version of the claim
  # @param parent_claim [Hash] parent version of the claim
  # @param changed_qualifiers [Array<Hash>] list to append to (mutated)
  # @param added_qualifiers [Array<Hash>] list to append to (mutated)
  # @param removed_qualifiers [Array<Hash>] list to append to (mutated)
  # @param claim_key [Object] key of the claim, copied into each entry
  # @param claim_index [Integer] index of the claim, copied into each entry
  # @return [Hash{Symbol => Array<Hash>}] keys :added_qualifiers,
  #   :removed_qualifiers and :changed_qualifiers (the lists passed in)
  def self.handle_changed_qualifiers(current_claim, parent_claim, changed_qualifiers, added_qualifiers, removed_qualifiers, claim_key, claim_index)
    current_qualifiers = current_claim["qualifiers"] || {}
    parent_qualifiers = parent_claim["qualifiers"] || {}

    current_qualifiers.each do |qualifier_key, qualifier_values|
      parent_values = parent_qualifiers[qualifier_key] || []

      qualifier_values.each_with_index do |qualifier_value, qualifier_index|
        parent_value = parent_values[qualifier_index]
        entry = {
          claim_key: claim_key,
          claim_index: claim_index,
          qualifier_key: qualifier_key,
          qualifier_index: qualifier_index
        }

        if parent_value.nil?
          # Qualifier was added (new key, or new position under a known key)
          added_qualifiers << entry
        elsif parent_value != qualifier_value
          # Qualifier was changed
          changed_qualifiers << entry
        end
      end
    end

    # Check for removed qualifiers
    parent_qualifiers.each do |qualifier_key, qualifier_values|
      current_values = current_qualifiers[qualifier_key] || []

      qualifier_values.each_index do |qualifier_index|
        next unless current_values[qualifier_index].nil?

        removed_qualifiers << {
          claim_key: claim_key,
          claim_index: claim_index,
          qualifier_key: qualifier_key,
          qualifier_index: qualifier_index
        }
      end
    end

    {
      added_qualifiers: added_qualifiers,
      removed_qualifiers: removed_qualifiers,
      changed_qualifiers: changed_qualifiers
    }
  end
end
@@ -0,0 +1,49 @@
1
# Helpers for locating reference differences inside a single claim.
class ReferenceAnalyzer
  # Appends one entry per reference of the claim to updated_references. Used
  # both when a whole claim was added and when it was removed, which is why
  # the target list is supplied by the caller.
  #
  # @param claim [Hash] claim that may carry a "references" array
  # @param updated_references [Array<Hash>] list to append to (mutated)
  # @param claim_key [Object] key of the claim, copied into each entry
  # @param claim_index [Integer] index of the claim, copied into each entry
  # @return [Array<Hash>] the same updated_references list
  def self.reference_updates(claim, updated_references, claim_key, claim_index)
    (claim["references"] || []).each_index do |ref_index|
      updated_references << { claim_key: claim_key, claim_index: claim_index, reference_index: ref_index }
    end
    updated_references
  end

  # Compares the references of a claim with those of its parent version and
  # appends the findings to the three caller-supplied lists.
  #
  # References are matched by equality, not by position: a current reference
  # with no equal in the parent is added, and a parent reference with no equal
  # in the current claim is removed.
  #
  # NOTE(review): a current reference that IS present verbatim in the parent
  # is still reported as changed whenever ref_modified? finds any parent
  # reference with different snaks, i.e. whenever the parent holds some other,
  # different reference. This looks unintended; confirm the desired meaning of
  # "changed" before relying on :changed_references.
  #
  # @param current_claim [Hash] current version of the claim
  # @param parent_claim [Hash] parent version of the claim
  # @param changed_references [Array<Hash>] list to append to (mutated)
  # @param added_references [Array<Hash>] list to append to (mutated)
  # @param removed_references [Array<Hash>] list to append to (mutated)
  # @param claim_key [Object] key of the claim, copied into each entry
  # @param claim_index [Integer] index of the claim, copied into each entry
  # @return [Hash{Symbol => Array<Hash>}] keys :added_references,
  #   :removed_references and :changed_references (the lists passed in)
  def self.handle_changed_references(current_claim, parent_claim, changed_references, added_references, removed_references, claim_key, claim_index)
    current_references = current_claim["references"] || []
    parent_references = parent_claim["references"] || []

    current_references.each_with_index do |current_ref, ref_index|
      entry = { claim_key: claim_key, claim_index: claim_index, reference_index: ref_index }

      # An empty parent list needs no special case: include? is false for it.
      if !parent_references.include?(current_ref)
        added_references << entry
      elsif ref_modified?(current_ref, parent_references)
        changed_references << entry
      end
    end

    parent_references.each_with_index do |parent_ref, ref_index|
      next if current_references.include?(parent_ref)

      removed_references << { claim_key: claim_key, claim_index: claim_index, reference_index: ref_index }
    end

    {
      added_references: added_references,
      removed_references: removed_references,
      changed_references: changed_references
    }
  end

  # Returns true when at least one parent reference has "snaks" that differ
  # from those of current_reference; false otherwise, including for an empty
  # parent list.
  #
  # @param current_reference [Hash] reference carrying a "snaks" value
  # @param parent_references [Array<Hash>] references to compare against
  # @return [Boolean]
  def self.ref_modified?(current_reference, parent_references)
    parent_references.any? do |parent_reference|
      current_reference["snaks"] != parent_reference["snaks"]
    end
  end
end