chem_scanner 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +604 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/CODE_OF_CONDUCT.md +74 -0
  9. data/Gemfile +20 -0
  10. data/LICENSE.txt +661 -0
  11. data/README.md +177 -0
  12. data/Rakefile +8 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/chem_scanner.gemspec +43 -0
  16. data/lib/chem_scanner.rb +79 -0
  17. data/lib/chem_scanner/cdx.rb +67 -0
  18. data/lib/chem_scanner/cdxml.rb +72 -0
  19. data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
  20. data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
  21. data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
  22. data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
  23. data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
  24. data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
  25. data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
  26. data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
  27. data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
  28. data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
  29. data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
  30. data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
  31. data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
  32. data/lib/chem_scanner/chem_draw/parser.rb +214 -0
  33. data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
  34. data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
  35. data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
  36. data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
  37. data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
  38. data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
  39. data/lib/chem_scanner/configuration/superatom.rb +76 -0
  40. data/lib/chem_scanner/configuration/superatom.txt +2874 -0
  41. data/lib/chem_scanner/configuration/util.rb +40 -0
  42. data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
  43. data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
  44. data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
  45. data/lib/chem_scanner/doc.rb +56 -0
  46. data/lib/chem_scanner/docx.rb +86 -0
  47. data/lib/chem_scanner/export/cml.rb +176 -0
  48. data/lib/chem_scanner/extension/element_map.rb +9 -0
  49. data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
  50. data/lib/chem_scanner/extension/geometry/line.rb +123 -0
  51. data/lib/chem_scanner/extension/geometry/point.rb +18 -0
  52. data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
  53. data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
  54. data/lib/chem_scanner/extension/passthrough.rb +7 -0
  55. data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
  56. data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
  57. data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
  58. data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
  59. data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
  60. data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
  61. data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
  62. data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
  63. data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
  64. data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
  65. data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
  66. data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
  67. data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
  68. data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
  69. data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
  70. data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
  71. data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
  72. data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
  73. data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
  74. data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
  75. data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
  76. data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
  77. data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
  78. data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
  79. data/lib/chem_scanner/interpreter/scheme.rb +173 -0
  80. data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
  81. data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
  82. data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
  83. data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
  84. data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
  85. data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
  86. data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
  87. data/lib/chem_scanner/perkin_eln.rb +287 -0
  88. data/lib/chem_scanner/version.rb +5 -0
  89. data/lib/rubygems_plugin.rb +5 -0
  90. metadata +244 -0
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module ReactionDetection
8
+ # (1): A ---> C
9
+ #
10
+ # (2): B ---> D
11
+ # |
12
+ # |
13
+ # V
14
+ # E
15
+ # Remove C from (2)
16
# Prunes reactant/product id lists of every reaction in @reactions.
#
# For each group with at least two ids, a molecule is dropped when
#   * its distance (see distance_molecule_group) exceeds +dist_gap+ times
#     the smallest distance found in the group, and
#   * it also belongs to another reaction whose arrow is not parallel to
#     the current arrow.
# Group members whose polygon center lies closer than the same threshold to
# a dropped molecule are dropped together with it.
#
# The id arrays returned by +reactant_ids+ / +product_ids+ are modified in
# place.
def remove_separated_mol
  dist_gap = 2.0

  @reactions.each do |r|
    arrow = @arrow_map[r.arrow_id]

    %w[reactant_ids product_ids].each do |group|
      rgroup = r.send(group)
      next if rgroup.count < 2

      # Distance map of 1 molecule to arrow
      # and other molecules within group
      dist_map = distance_molecule_group(rgroup, arrow, group)
      # distance_molecule_group skips ids unknown to @mol_map, so the map
      # can be empty even though the group is not.
      next if dist_map.empty?

      threshold = dist_gap * dist_map.values.min

      separated = dist_map.select do |k, v|
        v > threshold && @reactions.any? do |other|
          other.arrow_id != r.arrow_id &&
            other.molecule_ids.include?(k) &&
            !arrow.parallel_to?(@arrow_map[other.arrow_id])
        end
      end

      # Hash#keys returns a fresh array, safe to extend below
      remove_keys = separated.keys

      separated.each_key do |k|
        mol = @mol_map[k]
        next if mol.nil?

        (rgroup - [k]).each do |id|
          om = @mol_map[id]
          next if om.nil?

          d = Geometry.distance(mol.polygon.center, om.polygon.center)
          remove_keys.push(id) if d < threshold
        end
      end

      rgroup.delete_if { |x| remove_keys.include?(x) }
    end
  end
end
68
+
69
# Computes, for each id of +rgroup+ that is present in @mol_map, the
# smaller of
#   * the distance from the arrow end point to the points where the
#     molecule polygon intersects the arrow line, and
#   * the distance between those points and the intersection points of the
#     other molecules of the group.
#
# The arrow tail (and tail segment line) is used when +group+ is
# "reactant_ids", the head otherwise. 9_999_999 is reported when no
# intersection point is available.
#
# @param rgroup [Array] molecule ids of one reaction group
# @param arrow  arrow object providing tail/head and tail_segment/head_segment
# @param group  [String] "reactant_ids" or any other value for products
# @return [Hash] molecule id => distance
def distance_molecule_group(rgroup, arrow, group)
  unreachable = 9_999_999

  if group == "reactant_ids"
    apoint = arrow.tail
    aline = arrow.tail_segment.to_line
  else
    apoint = arrow.head
    aline = arrow.head_segment.to_line
  end

  # Intersect each known molecule with the arrow line exactly once
  points_by_id = {}
  rgroup.each do |id|
    mol = @mol_map[id]
    next if mol.nil? || points_by_id.key?(id)

    points_by_id[id] = mol.polygon.intersection_points_with_line(aline)
  end

  dist_map = {}
  points_by_id.each do |id, inter_points|
    # Distance to arrow
    da = unreachable
    inter_points.each do |point|
      length = Geometry.distance(apoint, point)
      da = length if length < da
    end

    # Distance to other molecule within group
    dmols = unreachable
    points_by_id.each do |mid, other_points|
      next if mid == id

      other_points.each do |op|
        inter_points.each do |p|
          length = Geometry.distance(p, op)
          dmols = length if length < dmols
        end
      end
    end

    dist_map[id] = [da, dmols].min
  end

  dist_map
end
113
+ end
114
+ end
115
+ end
@@ -0,0 +1,166 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module ReactionDetection
8
+ # Attach/bind text to molecule or arrow
9
+ def assign_text
10
+ tgroup_ids = @mol_group_map.keys
11
+ text_as_mol_ids = []
12
+
13
+ @text_map.each do |k, text|
14
+ group = try_detect_label_position(text)
15
+ center = text.polygon.center
16
+
17
+ min_mol = nearest_molecule(center)
18
+ min_arrow = nearest_arrow(text)
19
+ arrow = @arrow_map[min_arrow.key]
20
+
21
+ if arrow.nil?
22
+ mol_key = min_mol.key
23
+
24
+ if group.nil?
25
+ @mol_map[mol_key].text_ids.push(k)
26
+ else
27
+ text_as_mol_ids.push(id: k, mol: mol_key, group: group)
28
+ end
29
+
30
+ next
31
+ end
32
+
33
+ if min_mol.key.zero?
34
+ arrow.text_arr.push(min_arrow.key)
35
+ next
36
+ end
37
+
38
+ to_arrow = (
39
+ min_arrow.value < min_mol.value * 2.5 &&
40
+ text_around_arrow?(arrow, text, min_arrow.value)
41
+ )
42
+
43
+ if to_arrow
44
+ arrow.text_arr.push(k)
45
+ next
46
+ end
47
+
48
+ # Do not add a molecule-group text to molecule as description
49
+ @mol_map[min_mol.key].text_ids.push(k) unless tgroup_ids.include?(k)
50
+ end
51
+
52
+ text_as_mol_ids.each do |tinfo|
53
+ tid = tinfo[:id]
54
+ text = @text_map[tid]
55
+ mid = tinfo[:mol]
56
+ mol = @mol_map.values.detect { |m| m.label == text.bold_text }
57
+
58
+ if mol.nil?
59
+ @mol_map[mid].text_ids.push(tid)
60
+ else
61
+ rid = tinfo[:group].keys.first
62
+ group = tinfo[:group][rid]
63
+ reaction = @reactions.detect { |r| r.arrow_id == rid }
64
+ rgroup = reaction.send("#{group[0..-2]}_ids")
65
+ rgroup.push(mol.id).uniq!
66
+ end
67
+ end
68
+
69
+ @mol_map.each_value { |mol| assemble_molecule_text(mol) }
70
+ end
71
+
72
+ def try_detect_label_position(text)
73
+ return nil if text.value != text.bold_text
74
+
75
+ group_pos = {}
76
+ @reactions.each do |reaction|
77
+ rid = reaction.arrow_id
78
+ arrow = @arrow_map[rid]
79
+ group = detect_position(arrow, text.polygon)
80
+ next if group.nil?
81
+
82
+ group_pos[rid] = group
83
+ end
84
+
85
+ return nil unless group_pos.size == 1
86
+
87
+ pos = group_pos.values.first
88
+ return nil unless %w[reactants products].include?(pos)
89
+
90
+ group_pos
91
+ end
92
+
93
+ def nearest_molecule(point)
94
+ min_mol = OpenStruct.new(key: 0, value: 9_999_999)
95
+
96
+ @mol_map.each do |okey, mol|
97
+ dist = mol.min_distance_to_point(point)
98
+
99
+ if dist < min_mol.value
100
+ min_mol.key = okey
101
+ min_mol.value = dist
102
+ end
103
+ end
104
+
105
+ min_mol
106
+ end
107
+
108
+ def nearest_arrow(text)
109
+ min_arrow = OpenStruct.new(key: 0, value: 9_999_999)
110
+ tpoly = text.polygon
111
+
112
+ @arrow_map.each do |okey, arrow|
113
+ arrow.segments.each do |segment|
114
+ ppoint = segment.to_line.point_projection(tpoly.center)
115
+ seg_contains = segment.contains_point?(ppoint)
116
+ next unless seg_contains
117
+
118
+ dist = segment.distance_to_boundingbox(tpoly)
119
+
120
+ if dist < min_arrow.value
121
+ min_arrow.key = okey
122
+ min_arrow.value = dist
123
+ end
124
+ end
125
+ end
126
+
127
+ min_arrow
128
+ end
129
+
130
+ def text_around_arrow?(arrow, text, dist)
131
+ tpoly = text.polygon
132
+ is_middle = arrow.poly_in_middle?(text.polygon)
133
+ return false unless is_middle
134
+
135
+ pheight = [tpoly.width, tpoly.height].max
136
+ arrow.build_polygons(pheight + dist)
137
+ cur_height = arrow.height
138
+ arrow.build_polygons(cur_height)
139
+
140
+ tcenter = tpoly.center
141
+ reaction = @reactions.detect { |r| r.arrow_id == arrow.id }
142
+ arrow.segments.each do |aseg|
143
+ pseg = aseg.perpen_segment_via_point(tcenter)
144
+ check_contains = (
145
+ aseg.contains_point?(pseg.point1) ||
146
+ aseg.contains_point?(pseg.point2)
147
+ )
148
+ mol_ids = molecules_intersects_with_segment(pseg)
149
+ mol_ids = mol_ids - reaction.reagent_ids
150
+ return true if mol_ids.empty? && check_contains
151
+ end
152
+
153
+ false
154
+ end
155
+
156
# Collects the keys of all molecules in @mol_map whose polygon is
# intersected by +segment+ (segment.intersects_with_polygon?).
#
# @return [Array] molecule keys in @mol_map iteration order
def molecules_intersects_with_segment(segment)
  @mol_map.each_with_object([]) do |(mol_key, mol), hits|
    hits.push(mol_key) if segment.intersects_with_polygon?(mol.polygon)
  end
end
164
+ end
165
+ end
166
+ end
@@ -0,0 +1,173 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module ChemScanner
5
+ module Interpreter
6
+ Gem.find_files("chem_scanner/interpreter/*/*.rb").each { |f| require f }
7
+
8
+ using Extension
9
+
10
+ # General scheme, contains all graphics (molecules, text, arrows ...)
11
class Scheme
  attr_reader :mol_map, :text_map, :bracket_map, :reactions,
              :n_atoms, :fragment_as_line

  include PreProcess
  include ReactionDetection
  include PostProcess

  # Copies the element maps of +parser+ and prepares empty result maps.
  #
  # @param parser object providing fragment_map, fragment_group_map,
  #   geometry_map, graphic_map, text_map and bracket_map; every
  #   fragment_map value is wrapped in a Fragment
  def initialize(parser)
    fragment_map = parser.fragment_map.map { |k, v| [k, Fragment.new(v)] }
    @fragment_map = fragment_map.to_h
    @fragment_group_map = parser.fragment_group_map

    @geometry_map = parser.geometry_map
    @graphic_map = parser.graphic_map

    @text_map = parser.text_map
    @bracket_map = parser.bracket_map

    @mol_map = ElementMap.new
    @mol_group_map = ElementMap.new

    @arrow_map = ElementMap.new
    # Segment or headless arrow
    @segment_map = ElementMap.new

    @mol_substitutes = {}
    @reaction_substitutes = {}

    @fragment_as_line = 0

    @reactions = []
  end

  # Runs the three interpretation phases (pre-process, reaction detection,
  # post-process), then the text-group interpretation, and finally adds
  # every molecule of @mol_group_map that is not yet in @mol_map.
  def interpret
    pre_process
    reaction_detection
    post_process

    tgi = TextGroupInterpreter.new(self)
    # Detect if molecule has any n-atom, save those infos
    tgi.retrieve_n_atoms_info

    @n_atoms = tgi.n_atoms

    # Retrieve rgroups, alias-groups of molecules
    tgi.retrieve_alias_info

    # - Find R-groups ("R1", "R2", "R", ...)
    # - Find alias-groups ("X", "Y", "Ar", "M")
    # - Detect label set ("2a,b" "3-6" ...)
    # tgi.retrieve_labels_and_groups

    # - Combine corresponding addition info detected molecule/reaction text
    #   e.g., "3: m = 1, R = H"
    # - Interpret previously retrieved data
    # - Save those infos to generate molecules/reactions later
    # interpret_labels_and_groups

    # Try generate new molecules/reactions
    # based on R-groups, alias-groups, n-atoms ...
    tgi.generate_elements

    @mol_group_map.each_value do |mgroup|
      mgroup.molecules.each do |m|
        @mol_map[m.id] = m unless @mol_map.key?(m.id)
      end
    end
  end

  # @return all molecules currently held in the molecule map
  def molecules
    @mol_map.values
  end

  private

  # Phase 1: clean up raw graphics before reactions are searched.
  def pre_process
    # Retrieve fragments which are covered by a rectangle
    find_fragment_inside_rectangle

    # - Attach detected above to arrow
    # - Try to detect cross arrow ( --//--> or --X--> )
    #
    #   -----|
    #        |
    #        V
    # - Extend arrows if possible
    #
    #        |------>
    #        |
    #   -----|
    #        |
    #        |------>
    # - Split extend arrows if possible
    refine_arrow

    extract_fragment_graphic
    refine_molecules
  end

  # Phase 2: build reactions from arrows, molecules and texts.
  def reaction_detection
    # Adding molecules based on molecules and arrow position
    assign_to_reaction

    # (1): A ---> C
    #
    # (2): B ---> D
    #             |
    #             |
    #             V
    #             E
    # Remove C from (2)
    #
    # Remove if one molecule is separated from the others in the same group
    # If it is too far, will consider it not a part of the reaction
    remove_separated_mol

    # Following current algorithm, reagents could belong to multiple
    # reactions. Only take the nearest one
    refine_duplicate_reagents

    # Attach text to molecule or arrow
    # Process molecule label
    assign_text

    # Text can also be reactants/products.
    # Process these ONLY IF text does not belong to any reaction or molecule
    assign_molecule_group

    # NOTE: Handle some specific scenario from here

    # A -> B ->
    # C -> D -> E
    # For this case, we will have an extra implicit reaction: B -> C
    # For now, only deal with this case if all arrows are horizontal
    multi_line_chain_reaction
  end

  # Phase 3: resolve labels and texts, then assemble the final reactions.
  def post_process
    # Check if there is any label inside reagents
    # which is not assigned to any molecule
    refine_reagents_label

    # Label usually present a molecule, process those in reagents text
    replace_label_by_molecule

    # Text-as-label, e.g. "ligand = ", "amide = "
    refine_text_label

    refine_text_as_molecule

    # From id => molecule
    assemble_reaction

    # - Extract reaction-related information: temperature, time, yield
    # - Try interpret abbreviations
    @reactions.each { |r| process_reaction_info(r) }

    process_reactions_step
  end
end
172
+ end
173
+ end