chem_scanner 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +604 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/CODE_OF_CONDUCT.md +74 -0
  9. data/Gemfile +20 -0
  10. data/LICENSE.txt +661 -0
  11. data/README.md +177 -0
  12. data/Rakefile +8 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/chem_scanner.gemspec +43 -0
  16. data/lib/chem_scanner.rb +79 -0
  17. data/lib/chem_scanner/cdx.rb +67 -0
  18. data/lib/chem_scanner/cdxml.rb +72 -0
  19. data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
  20. data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
  21. data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
  22. data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
  23. data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
  24. data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
  25. data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
  26. data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
  27. data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
  28. data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
  29. data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
  30. data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
  31. data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
  32. data/lib/chem_scanner/chem_draw/parser.rb +214 -0
  33. data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
  34. data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
  35. data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
  36. data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
  37. data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
  38. data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
  39. data/lib/chem_scanner/configuration/superatom.rb +76 -0
  40. data/lib/chem_scanner/configuration/superatom.txt +2874 -0
  41. data/lib/chem_scanner/configuration/util.rb +40 -0
  42. data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
  43. data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
  44. data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
  45. data/lib/chem_scanner/doc.rb +56 -0
  46. data/lib/chem_scanner/docx.rb +86 -0
  47. data/lib/chem_scanner/export/cml.rb +176 -0
  48. data/lib/chem_scanner/extension/element_map.rb +9 -0
  49. data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
  50. data/lib/chem_scanner/extension/geometry/line.rb +123 -0
  51. data/lib/chem_scanner/extension/geometry/point.rb +18 -0
  52. data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
  53. data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
  54. data/lib/chem_scanner/extension/passthrough.rb +7 -0
  55. data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
  56. data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
  57. data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
  58. data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
  59. data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
  60. data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
  61. data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
  62. data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
  63. data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
  64. data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
  65. data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
  66. data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
  67. data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
  68. data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
  69. data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
  70. data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
  71. data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
  72. data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
  73. data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
  74. data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
  75. data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
  76. data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
  77. data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
  78. data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
  79. data/lib/chem_scanner/interpreter/scheme.rb +173 -0
  80. data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
  81. data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
  82. data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
  83. data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
  84. data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
  85. data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
  86. data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
  87. data/lib/chem_scanner/perkin_eln.rb +287 -0
  88. data/lib/chem_scanner/version.rb +5 -0
  89. data/lib/rubygems_plugin.rb +5 -0
  90. metadata +244 -0
@@ -0,0 +1,150 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module PreProcess
8
# Entry point of the molecule pre-processing stage.
#
# Runs the three preparation passes in a fixed order and finishes with the
# ionic-molecule assembly, whose result is returned unchanged.
def refine_molecules
  # Order matters: molecules must exist before their info can be populated.
  %i[
    process_orbital_as_polymer
    fragment_to_molecules
    populate_molecule_info
  ].each { |pass| send(pass) }

  assemble_ionic_molecule
end
15
+
16
# Flags fragment nodes that lie inside certain orbital graphics as polymer.
#
# Only graphics with orbital_type 256 and oval_type 3 are considered
# (NOTE(review): the meaning of these two constants is not visible here;
# confirm against the ChemDraw property tables). For every node whose point
# is contained in such a graphic's polygon, the node is marked as polymer and
# the owning fragment's polygon is replaced by its merge with the graphic's
# polygon.
def process_orbital_as_polymer
  @graphic_map.each_value do |shape|
    next if shape.orbital_type != 256 || shape.oval_type != 3

    outline = shape.polygon
    next if outline.nil?

    @fragment_map.each_value do |frag|
      frag.node_map.each_value do |atom_node|
        if outline.contains?(atom_node.point)
          atom_node.set_is_polymer
          frag.polygon = frag.polygon.merge_polygon(outline)
        end
      end
    end
  end
end
33
+
34
# Builds the molecule tables from the parsed fragments.
#
# * Every fragment in @fragment_map that has at least one node becomes a
#   Molecule; it is processed and stored in @mol_map under the same key.
# * Every entry of @fragment_group_map becomes a MoleculeGroup carrying the
#   entry's title, stored in @mol_group_map under the same key.
def fragment_to_molecules
  @fragment_map.each_pair do |fid, frag|
    unless frag.node_map.count.zero?
      molecule = Molecule.new(frag)
      molecule.process
      @mol_map[fid] = molecule
    end
  end

  @fragment_group_map.each_pair do |gid, info|
    group = MoleculeGroup.new
    group.title = info[:title]

    info[:fragment_map].each_value do |frag|
      # NOTE: a nested fragment should not contain any special node type.
      # For instance, there are cases where DMF is implicitly converted to
      # C-C-C with the nickname D-M-F; such fragments are left out.
      plain = frag.node_map.none? { |_, n| n.type.positive? }
      group.add_fragment(frag) if plain
    end

    @mol_group_map[gid] = group
  end
end
58
+
59
# Refreshes the output formats of every known molecule: those stored
# directly in @mol_map followed by those held by the groups in
# @mol_group_map. Returns the combined list of molecules.
def populate_molecule_info
  grouped = @mol_group_map.values.flat_map(&:molecules)

  (@mol_map.values + grouped).each(&:update_output_formats)
end
67
+
68
# Pairs oppositely charged molecules that sit close to each other and merges
# each pair into one molecule.
#
# Candidates are the molecules of @mol_map plus the sole molecule of every
# single-molecule group in @mol_group_map, restricted to molecules with
# exactly one charged atom. For each candidate the nearest candidate with the
# exactly opposite charge is looked up (distance between bounding-box
# centres). A pair is kept only if that distance does not exceed the
# threshold below and the candidate is not already part of another pair.
# When several candidates claim the same partner, all of those claims are
# dropped.
#
# For every remaining pair the partner is added to the claiming molecule,
# the molecule's output formats are refreshed, and the partner is removed
# from @mol_map / @mol_group_map (together with its group title in
# @text_map).
#
# Returns the hash of applied pairs ({ claiming id => absorbed id }).
def assemble_ionic_molecule
  # Estimated value, could change later.
  max_pair_distance = 4

  # Describes +mol+ as a pairing candidate; nil unless exactly one atom is
  # charged.
  describe = lambda do |mol, mid|
    charged_ids = mol.charged_atom_ids
    next nil unless charged_ids.size == 1

    aid = charged_ids.first
    { mol: mol, aid: aid, charge: mol.atom_map[aid].charge, mid: mid }
  end

  candidates = @mol_map.map { |mid, mol| describe.call(mol, mid) }
  @mol_group_map.each do |gid, group|
    next unless group.molecules.count == 1

    candidates.push(describe.call(group.molecules.first, gid))
  end
  candidates.compact!

  grouped = {}
  candidates.each do |ion|
    center = ion[:mol].polygon.bounding_box.center
    partner = nil
    partner_dist = Float::INFINITY

    candidates.each do |other|
      # An entry must never be paired with itself.
      next if other.equal?(ion) || other[:charge] != -ion[:charge]

      dist = Geometry.distance(center, other[:mol].polygon.bounding_box.center)
      next unless dist < partner_dist

      partner_dist = dist
      partner = other
    end
    next if partner.nil? || partner_dist > max_pair_distance

    mid = ion[:mid]
    # Each molecule takes part in at most one pair, on either side.
    next if grouped.key?(mid) || grouped.value?(mid)

    grouped[mid] = partner[:mid]
  end

  # { a1 => b, a2 => b, a3 => c } then remove both a1 and a2
  claims = Hash.new(0)
  grouped.each_value { |partner_id| claims[partner_id] += 1 }
  grouped.delete_if { |_, partner_id| claims[partner_id] > 1 }

  # Resolves an id to a molecule, looking in @mol_map first and falling back
  # to the first molecule of the group with that id.
  fetch_mol = lambda do |id|
    @mol_map.key?(id) ? @mol_map[id] : @mol_group_map[id].molecules.first
  end

  grouped.each do |mid, partner_id|
    mol = fetch_mol.call(mid)
    mol.add(fetch_mol.call(partner_id))
    mol.update_output_formats

    @mol_map.delete(partner_id)
    absorbed_group = @mol_group_map.delete(partner_id)
    next if absorbed_group.nil?

    # The absorbed group's title is no longer needed as a free text.
    @text_map.delete(absorbed_group.title.id)
  end
end
148
+ end
149
+ end
150
+ end
@@ -0,0 +1,129 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ require "chem_scanner/interpreter/scheme_base"
8
+
9
+ module ReactionDetection
10
+ include ChemScanner::Interpreter::SchemeBase
11
+
12
# Creates one Reaction per arrow and distributes the unboxed molecules of
# @mol_map over its reagent / reactant / product lists.
#
# The work happens in three phases:
# 1. For every arrow, every unboxed molecule is positioned relative to it
#    (after all arrow polygons have been rebuilt for that molecule's size).
#    Molecules that match no position are remembered per arrow.
# 2. A molecule that is a reagent of one reaction and a reactant/product of
#    another is removed from one of the two, depending on its distance to
#    the first reaction's arrow.
# 3. After refitting the arrow polygons, the remembered molecules get a
#    second positioning attempt.
#
# Returns the hash of molecules left unplaced after phase 1.
def assign_to_reaction
  # Maps a position label to the matching id list of +reaction+
  # (nil for an unknown or missing label).
  bucket_for = lambda do |reaction, label|
    case label
    when "reagents" then reaction.reagent_ids
    when "reactants" then reaction.reactant_ids
    when "products" then reaction.product_ids
    end
  end

  unplaced = {}

  @arrow_map.each do |arrow_id, arrow|
    reaction = Reaction.new
    reaction.arrow_id = arrow_id
    leftovers = []

    free_mols = @mol_map.reject { |_, mol| mol.boxed }
    free_mols.each do |mol_id, mol|
      outline = mol.polygon

      # Rebuild every arrow's polygons sized for the current molecule.
      @arrow_map.each_value do |any_arrow|
        gap = any_arrow.min_distance_to_polygon(outline)
        any_arrow.build_polygons(outline.height + gap)
      end

      bucket = bucket_for.call(reaction, detect_position(arrow, outline))
      (bucket || leftovers).push(mol_id)
    end

    @reactions.push(reaction)
    unplaced[arrow_id] = leftovers unless leftovers.empty?
  end

  # Molecules which are both reagents and reactants/products:
  # if the reagent -> arrow distance is in range, keep it as a reagent,
  # otherwise keep it as a reactant/product of the other reaction.
  @reactions.each do |current|
    reagent_ids = current.reagent_ids
    arrow = @arrow_map[current.arrow_id]

    rivals = @reactions.reject { |r| r.arrow_id == current.arrow_id }
    rivals.each do |rival|
      shared = (reagent_ids & rival.reactant_ids) + (reagent_ids & rival.product_ids)

      shared.each do |mol_id|
        gap = arrow.min_distance_to_polygon(@mol_map[mol_id].polygon)
        loser = gap > 2 ? current : rival
        loser.delete_id(mol_id)
      end
    end
  end

  auto_fit_arrow_polygons

  unplaced.each do |arrow_id, mol_ids|
    reaction = @reactions.detect { |r| r.arrow_id == arrow_id }
    arrow = @arrow_map[arrow_id]

    mol_ids.each do |mol_id|
      label = detect_position(arrow, @mol_map[mol_id].polygon)
      bucket = bucket_for.call(reaction, label)
      bucket.push(mol_id) unless bucket.nil?
    end
  end
end
81
+
82
# Classifies a molecule polygon relative to an arrow.
#
# Returns "products" when the head-side position check passes and the
# polygon's centre is on the arrow's product side, "reactants" for the
# mirrored tail-side check, "reagents" when the arrow reports the polygon as
# lying around it, and nil otherwise. The checks are tried in that order.
def detect_position(arrow, mol_poly)
  center = mol_poly.center

  if check_position(mol_poly, arrow) && arrow.product_side?(center)
    "products"
  elsif check_position(mol_poly, arrow, false) && arrow.reactant_side?(center)
    "reactants"
  elsif arrow.polygon_around?(mol_poly)
    "reagents"
  end
end
95
+
96
# Checks whether a molecule belongs to the reaction of +arrow+.
#
# The head segment of the arrow is used when +prod_side+ is true, the tail
# segment otherwise. The check fails when the line through that segment does
# not intersect the molecule polygon. It also fails when another arrow
# "blocks" the way: its corresponding segment's line intersects the polygon
# too and the arrow intersects the stretch between this arrow's segment end
# (point2) and the first intersection point. Other arrows whose head segment
# contains, or is contained in, this arrow's segment are ignored.
#
# Returns true or false.
def check_position(mol_poly, arrow, prod_side = true)
  side = prod_side ? :head_segment : :tail_segment

  probe = arrow.public_send(side)
  probe_line = probe.to_line
  return false unless probe_line.intersects_with_polygon?(mol_poly)

  hit = probe_line.intersection_points_with_polygon(mol_poly).first
  reach = Geometry::Segment.new(probe.point2, hit)

  blocked = @arrow_map.except(arrow.id).each_value.any? do |rival|
    rival_head = rival.head_segment
    # Overlapping arrows are not treated as competitors.
    next false if rival_head.contains_segment?(probe) || probe.contains_segment?(rival_head)

    rival.public_send(side).to_line.intersects_with_polygon?(mol_poly) &&
      rival.all_intersects_with_segment?(reach)
  end

  !blocked
end
127
+ end
128
+ end
129
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ # Interpret the parsed/extracted geometry block
5
+ module Interpreter
6
+ using Extension
7
+
8
+ module ReactionDetection
9
# Resolves reagents that ended up attached to more than one reaction.
#
# For every ordered pair of distinct reactions (r, other):
# * ids that are reactants or products of +other+ are removed from r's
#   reagents;
# * for every id that is a reagent of both, the object (looked up in
#   @mol_map first, then @text_map) is compared against both arrows and is
#   scheduled for removal from r when it is farther from r's arrow than from
#   the other one.
#
# Removals are applied only after all pairs have been inspected, so the
# comparison itself never sees a half-updated state.
#
# NOTE(review): +contains_point?+ is used here as returning a point on the
# arrow (or nil), not a boolean - confirm against the arrow implementation.
#
# The return value (the list of scheduled [arrow_id, id] removals) is
# incidental; callers should not rely on it.
def refine_duplicate_reagents
  # Plain [arrow_id, id] pairs; no OpenStruct needed for two fields.
  stale = []

  @reactions.each do |r|
    arrow = @arrow_map[r.arrow_id]

    @reactions.each do |other|
      next if other.arrow_id == r.arrow_id

      r.reagent_ids -= other.reactant_ids + other.product_ids

      (r.reagent_ids & other.reagent_ids).each do |id|
        obj = @mol_map.key?(id) ? @mol_map[id] : @text_map[id]
        # The id may reference an object that has already been removed from
        # both maps; there is nothing to measure in that case.
        next if obj.nil?

        center = obj.polygon.center
        near = arrow.contains_point?(center)
        other_near = @arrow_map[other.arrow_id].contains_point?(center)
        next if near.nil? || other_near.nil?

        farther = center.distance_to(near) > center.distance_to(other_near)
        stale.push([r.arrow_id, id]) if farther
      end
    end
  end

  stale.each do |arrow_id, id|
    reaction = @reactions.detect { |r| r.arrow_id == arrow_id }
    reaction.delete_id(id)
  end
end
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module ReactionDetection
8
+ require "chem_scanner/interpreter/reaction_detection/text_assignment"
9
+
10
# Attaches titled single-molecule groups to reactions as reactants or
# products, based on where the group's title sits relative to the arrows.
#
# A group is considered only if its key is not among the texts already
# attached to any reaction arrow, it holds exactly one molecule, and that
# molecule is not boxed. The title polygon is positioned against every
# reaction arrow; groups whose title is a reagent of any reaction, or that
# match no reactant/product position at all, are skipped. Otherwise the
# first reactant/product match decides the reaction and the list that
# receives the molecule's fragment id, and the title text is moved from
# @text_map onto the molecule.
def assign_molecule_group
  arrow_text_ids = @reactions.flat_map { |r| @arrow_map[r.arrow_id].text_arr }

  auto_fit_arrow_polygons

  candidates = @mol_group_map.select do |text_id, group|
    next false if arrow_text_ids.include?(text_id)
    next false unless group.molecules.count == 1

    !group.molecules.first.boxed
  end

  candidates.each do |text_id, group|
    mol = group.molecules.first
    fragment_id = mol.fragment.id

    # arrow id => position label of the group title, for every arrow that
    # yields a position.
    roles = @reactions.each_with_object({}) do |reaction, found|
      arrow_id = reaction.arrow_id
      label = detect_position(@arrow_map[arrow_id], group.title.polygon)
      found[arrow_id] = label unless label.nil?
    end

    next if roles.value?("reagents")

    match = roles.detect { |_, label| %w[reactants products].include?(label) }
    next if match.nil?

    arrow_id, label = match

    # The title no longer needs to be kept in the text map.
    mol.text = @text_map.delete(text_id).value
    mol.text_ids.delete(text_id)
    @mol_map.each_value { |m| m.text_ids.delete(text_id) }

    reaction = @reactions.detect { |r| r.arrow_id == arrow_id }
    target_ids = label == "reactants" ? reaction.reactant_ids : reaction.product_ids
    target_ids.push(fragment_id)
  end
end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module ReactionDetection
8
# Links reactions of a chain that is drawn over several lines.
#
# Nothing is done when +check_reaction_orderring+ reports that the layout
# is not a candidate, or when every reaction already has both reactants and
# products. Otherwise arrows are arranged into rows (see +sort_arrow_map+)
# and each incomplete reaction is completed from a neighbouring row:
# * a reaction without reactants receives the products of the LAST reaction
#   of the previous row;
# * any other incomplete reaction (no products) receives the reactants of
#   the FIRST reaction of the next row.
#
# A reaction in the first row has no previous row and one in the last row
# has no next row; such reactions are left untouched.
def multi_line_chain_reaction
  return if check_reaction_orderring

  incomplete = @reactions.select do |r|
    r.reactant_ids.count.zero? || r.product_ids.count.zero?
  end
  return if incomplete.empty?

  auto_fit_arrow_polygons

  rows = sort_arrow_map
  reaction_for = ->(arrow_id) { @reactions.detect { |r| r.arrow_id == arrow_id } }

  incomplete.each do |reaction|
    row_index = rows.find_index { |row| row.include?(reaction.arrow_id) }
    next if row_index.nil?

    if reaction.reactant_ids.count.zero?
      # Without this guard index -1 would wrap around to the last row and
      # the first row would borrow products from the bottom of the scheme
      # (or from its own row when there is only one).
      next if row_index.zero?

      donor = reaction_for.call(rows[row_index - 1].last)
      reaction.reactant_ids.concat(donor.product_ids) unless donor.nil?
    else
      next_row = rows[row_index + 1]
      next if next_row.nil?

      donor = reaction_for.call(next_row.first)
      reaction.product_ids.concat(donor.reactant_ids) unless donor.nil?
    end
  end
end
47
+
48
# Tells whether multi-line chain linking should be skipped.
#
# Returns true when there are fewer than two arrows, or when any arrow has
# middle points or a head segment whose line is not horizontal; false
# otherwise.
def check_reaction_orderring
  return true if @arrow_map.count < 2

  @arrow_map.each_value.any? do |arrow|
    arrow.middle_points.count.positive? || !arrow.head_segment.to_line.horizontal?
  end
end
58
+
59
# Arranges the arrow ids into rows.
#
# Starting from the first remaining arrow, every remaining arrow whose head
# y-coordinate lies within that arrow's height above or below its own head
# is put into the same row. Ids inside a row are ordered by ascending head
# x; rows are ordered by descending head y of their first arrow.
#
# Returns an array of arrays of arrow ids.
def sort_arrow_map
  pending = @arrow_map.keys
  rows = []

  until pending.empty?
    seed = @arrow_map[pending.first]
    band = seed.height
    lower = seed.head.y - band
    upper = seed.head.y + band

    row, pending = pending.partition do |id|
      y = @arrow_map[id].head.y
      y >= lower && y <= upper
    end
    rows.push(row)
  end

  rows.each { |row| row.sort_by! { |id| @arrow_map[id].head.x } }
  rows.sort_by { |row| -@arrow_map[row.first].head.y }
end
83
+ end
84
+ end
85
+ end