chem_scanner 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +604 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/CODE_OF_CONDUCT.md +74 -0
  9. data/Gemfile +20 -0
  10. data/LICENSE.txt +661 -0
  11. data/README.md +177 -0
  12. data/Rakefile +8 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/chem_scanner.gemspec +43 -0
  16. data/lib/chem_scanner.rb +79 -0
  17. data/lib/chem_scanner/cdx.rb +67 -0
  18. data/lib/chem_scanner/cdxml.rb +72 -0
  19. data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
  20. data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
  21. data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
  22. data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
  23. data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
  24. data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
  25. data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
  26. data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
  27. data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
  28. data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
  29. data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
  30. data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
  31. data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
  32. data/lib/chem_scanner/chem_draw/parser.rb +214 -0
  33. data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
  34. data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
  35. data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
  36. data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
  37. data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
  38. data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
  39. data/lib/chem_scanner/configuration/superatom.rb +76 -0
  40. data/lib/chem_scanner/configuration/superatom.txt +2874 -0
  41. data/lib/chem_scanner/configuration/util.rb +40 -0
  42. data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
  43. data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
  44. data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
  45. data/lib/chem_scanner/doc.rb +56 -0
  46. data/lib/chem_scanner/docx.rb +86 -0
  47. data/lib/chem_scanner/export/cml.rb +176 -0
  48. data/lib/chem_scanner/extension/element_map.rb +9 -0
  49. data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
  50. data/lib/chem_scanner/extension/geometry/line.rb +123 -0
  51. data/lib/chem_scanner/extension/geometry/point.rb +18 -0
  52. data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
  53. data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
  54. data/lib/chem_scanner/extension/passthrough.rb +7 -0
  55. data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
  56. data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
  57. data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
  58. data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
  59. data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
  60. data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
  61. data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
  62. data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
  63. data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
  64. data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
  65. data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
  66. data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
  67. data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
  68. data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
  69. data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
  70. data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
  71. data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
  72. data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
  73. data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
  74. data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
  75. data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
  76. data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
  77. data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
  78. data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
  79. data/lib/chem_scanner/interpreter/scheme.rb +173 -0
  80. data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
  81. data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
  82. data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
  83. data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
  84. data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
  85. data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
  86. data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
  87. data/lib/chem_scanner/perkin_eln.rb +287 -0
  88. data/lib/chem_scanner/version.rb +5 -0
  89. data/lib/rubygems_plugin.rb +5 -0
  90. metadata +244 -0
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module ReactionDetection
8
+ # (1): A ---> C
9
+ #
10
+ # (2): B ---> D
11
+ # |
12
+ # |
13
+ # V
14
+ # E
15
+ # Remove C from (2)
16
# Prunes molecules that were attached to a reaction although they sit far
# away from the rest of their reactant/product group.
#
# For every reaction, each of its `reactant_ids` / `product_ids` lists that
# holds at least two ids is inspected. A molecule is dropped when
#   * its distance (as computed by #distance_molecule_group) exceeds
#     `dist_gap` times the smallest distance found in the group, and
#   * it is also listed in `molecule_ids` of another reaction whose arrow is
#     not parallel to this reaction's arrow.
# Group members whose polygon center lies closer than that same threshold to
# such a molecule are dropped together with it.
#
# The id lists are mutated in place (`delete_if`). The return value is
# whatever `@reactions.each` returns and is not meant to be used.
def remove_separated_mol
  dist_gap = 2.0

  @reactions.each do |r|
    arrow = @arrow_map[r.arrow_id]

    %w[reactant_ids product_ids].each do |group|
      rgroup = r.send(group)
      next if rgroup.count < 2

      # Distance map of 1 molecule to arrow
      # and other molecules within group
      dist_map = distance_molecule_group(rgroup, arrow, group)
      # Ids unknown to @mol_map are skipped by distance_molecule_group, so
      # the map may be empty; there is no minimum to compare against then.
      next if dist_map.empty?

      threshold = dist_gap * dist_map.values.min

      far_away = dist_map.select do |k, v|
        next false unless v > threshold

        # Only drop the molecule when a differently oriented reaction
        # also claims it.
        @reactions.any? do |other|
          next false if other.arrow_id == r.arrow_id
          next false unless other.molecule_ids.include?(k)

          !arrow.parallel_to?(@arrow_map[other.arrow_id])
        end
      end
      separated = far_away.keys

      # Separate list: neighbours added below must not be re-examined.
      remove_keys = separated.dup
      separated.each do |k|
        mol = @mol_map[k]
        next if mol.nil?

        (rgroup - [k]).each do |id|
          om = @mol_map[id]
          next if om.nil?

          d = Geometry.distance(mol.polygon.center, om.polygon.center)
          remove_keys.push(id) if d < threshold
        end
      end

      rgroup.delete_if { |x| remove_keys.include?(x) }
    end
  end
end
68
+
69
# Builds a `molecule id => distance` map for the molecules of one reaction
# group.
#
# rgroup - Array of molecule ids (the reaction's reactant or product ids).
# arrow  - arrow object; its tail / tail_segment are used when `group` is
#          "reactant_ids", its head / head_segment otherwise.
# group  - String, "reactant_ids" or anything else (treated as products).
#
# For each id present in @mol_map the stored value is the smaller of
#   * the shortest distance from the arrow end point to any point where the
#     molecule polygon meets the line through the arrow's end segment, and
#   * the shortest distance between those points and the corresponding
#     points of any other molecule of the group.
# When no intersection point exists the sentinel 9_999_999 is kept.
# Ids missing from @mol_map are left out of the result.
#
# Returns the Hash described above.
def distance_molecule_group(rgroup, arrow, group)
  if group == "reactant_ids"
    apoint = arrow.tail
    aline = arrow.tail_segment.to_line
  else
    apoint = arrow.head
    aline = arrow.head_segment.to_line
  end

  # Intersection points per molecule, computed lazily and at most once
  # (the line is the same for every lookup).
  # NOTE(review): assumes intersection_points_with_line has no side effects.
  points_of = Hash.new do |cache, id|
    cache[id] = @mol_map[id].polygon.intersection_points_with_line(aline)
  end

  rgroup.each_with_object({}) do |id, dist_map|
    next unless @mol_map.key?(id)

    inter_points = points_of[id]

    # Distance to arrow
    da = 9_999_999
    inter_points.each do |point|
      length = Geometry.distance(apoint, point)
      da = length if length < da
    end

    # Distance to other molecule within group
    dmols = 9_999_999
    (rgroup - [id]).each do |mid|
      next if @mol_map[mid].nil?

      points_of[mid].each do |other_point|
        inter_points.each do |own_point|
          length = Geometry.distance(own_point, other_point)
          dmols = length if length < dmols
        end
      end
    end

    dist_map[id] = [da, dmols].min
  end
end
113
+ end
114
+ end
115
+ end
@@ -0,0 +1,166 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module ReactionDetection
8
+ # Attach/bind text to molecule or arrow
9
# Binds every text of @text_map either to the nearest arrow (`text_arr`) or
# to the nearest molecule (`text_ids`).
#
# Decision per text:
#   * no arrow found: the text goes to the nearest molecule, unless
#     #try_detect_label_position recognised it as a label standing in a
#     reactant/product position; such texts are handled in a second pass.
#   * arrow found but no molecule (nearest molecule key is 0): the text goes
#     to the arrow.
#   * both found: the text goes to the arrow when the arrow distance is
#     below 2.5x the molecule distance and #text_around_arrow? agrees;
#     otherwise to the molecule (unless the text id is a molecule-group id).
#
# Second pass: a label text whose bold text equals the label of an existing
# molecule adds that molecule to the matching reactant/product id list of
# the reaction; otherwise the text falls back to the nearest molecule.
#
# Finally #assemble_molecule_text is called for every molecule.
def assign_text
  tgroup_ids = @mol_group_map.keys
  text_as_mol_ids = []

  @text_map.each do |k, text|
    group = try_detect_label_position(text)
    center = text.polygon.center

    min_mol = nearest_molecule(center)
    min_arrow = nearest_arrow(text)
    arrow = @arrow_map[min_arrow.key]

    if arrow.nil?
      mol_key = min_mol.key

      if group.nil?
        # With no molecule at all (key 0) there is nothing to attach to.
        nearest = @mol_map[mol_key]
        nearest.text_ids.push(k) unless nearest.nil?
      else
        text_as_mol_ids.push(id: k, mol: mol_key, group: group)
      end

      next
    end

    if min_mol.key.zero?
      # Attach the text id itself; the arrow's own id is not a text.
      arrow.text_arr.push(k)
      next
    end

    to_arrow = (
      min_arrow.value < min_mol.value * 2.5 &&
      text_around_arrow?(arrow, text, min_arrow.value)
    )

    if to_arrow
      arrow.text_arr.push(k)
      next
    end

    # Do not add a molecule-group text to molecule as description
    @mol_map[min_mol.key].text_ids.push(k) unless tgroup_ids.include?(k)
  end

  text_as_mol_ids.each do |tinfo|
    tid = tinfo[:id]
    text = @text_map[tid]
    mol = @mol_map.values.detect { |m| m.label == text.bold_text }

    if mol.nil?
      fallback = @mol_map[tinfo[:mol]]
      fallback.text_ids.push(tid) unless fallback.nil?
    else
      rid = tinfo[:group].keys.first
      group = tinfo[:group][rid]
      reaction = @reactions.detect { |r| r.arrow_id == rid }
      # "reactants" / "products" -> "reactant_ids" / "product_ids"
      rgroup = reaction.send("#{group[0..-2]}_ids")
      rgroup.push(mol.id).uniq!
    end
  end

  @mol_map.each_value { |mol| assemble_molecule_text(mol) }
end
71
+
72
# Checks whether a fully bold text stands in a reactant or product position
# of exactly one reaction.
#
# text - text object responding to `value`, `bold_text` and `polygon`.
#
# Returns a one-entry Hash `{ arrow_id => "reactants" | "products" }`, or
# nil when the text is not entirely bold, when #detect_position yields a
# position for zero or several reactions, or when the single position is
# neither "reactants" nor "products".
def try_detect_label_position(text)
  return nil unless text.value == text.bold_text

  positions = @reactions.each_with_object({}) do |reaction, found|
    arrow_id = reaction.arrow_id
    position = detect_position(@arrow_map[arrow_id], text.polygon)
    found[arrow_id] = position unless position.nil?
  end

  return nil if positions.size != 1

  %w[reactants products].include?(positions.values.first) ? positions : nil
end
92
+
93
# Finds the molecule of @mol_map closest to `point`, using each molecule's
# `min_distance_to_point`.
#
# Returns an OpenStruct with `key` (molecule id) and `value` (distance).
# When no molecule is closer than 9_999_999 the initial
# `key: 0, value: 9_999_999` is returned; on ties the first molecule wins.
def nearest_molecule(point)
  closest = OpenStruct.new(key: 0, value: 9_999_999)

  @mol_map.each do |mol_id, molecule|
    distance = molecule.min_distance_to_point(point)
    next unless distance < closest.value

    closest.key = mol_id
    closest.value = distance
  end

  closest
end
107
+
108
# Finds the arrow of @arrow_map whose segment lies closest to the text.
#
# A segment only takes part when the projection of the text polygon's center
# onto the segment's line falls on the segment itself; the distance is then
# `segment.distance_to_boundingbox(text.polygon)`.
#
# Returns an OpenStruct with `key` (arrow id) and `value` (distance), or the
# initial `key: 0, value: 9_999_999` when no segment qualifies.
def nearest_arrow(text)
  closest = OpenStruct.new(key: 0, value: 9_999_999)
  text_box = text.polygon

  @arrow_map.each do |arrow_id, arrow|
    arrow.segments.each do |seg|
      projected = seg.to_line.point_projection(text_box.center)
      next unless seg.contains_point?(projected)

      gap = seg.distance_to_boundingbox(text_box)
      next unless gap < closest.value

      closest.key = arrow_id
      closest.value = gap
    end
  end

  closest
end
129
+
130
# Tells whether a text sits next to an arrow without a foreign molecule in
# between.
#
# arrow - arrow object (poly_in_middle?, build_polygons, height, segments, id).
# text  - text object responding to `polygon`.
# dist  - distance between text and arrow; only used for the polygon rebuild.
#
# Returns false right away when `arrow.poly_in_middle?` rejects the text
# polygon. Otherwise returns true as soon as one arrow segment satisfies
# both: the perpendicular through the text center has an end point on the
# segment, and every molecule intersecting that perpendicular is one of the
# reaction's reagents. Returns false when no segment qualifies.
def text_around_arrow?(arrow, text, dist)
  text_box = text.polygon
  return false unless arrow.poly_in_middle?(text.polygon)

  longest_side = [text_box.width, text_box.height].max
  arrow.build_polygons(longest_side + dist)
  # NOTE(review): the height is read only after the first rebuild and is
  # immediately fed back; presumably meant to restore the previous
  # polygons - confirm against Arrow#build_polygons.
  arrow.build_polygons(arrow.height)

  center = text_box.center
  reaction = @reactions.detect { |r| r.arrow_id == arrow.id }

  arrow.segments.each do |segment|
    perpendicular = segment.perpen_segment_via_point(center)
    touches_arrow = segment.contains_point?(perpendicular.point1) ||
                    segment.contains_point?(perpendicular.point2)
    blocking = molecules_intersects_with_segment(perpendicular)
    blocking -= reaction.reagent_ids
    return true if blocking.empty? && touches_arrow
  end

  false
end
155
+
156
# Collects the ids of all molecules in @mol_map whose polygon is hit by
# `segment` (per `segment.intersects_with_polygon?`).
#
# Returns an Array of molecule ids in @mol_map iteration order.
def molecules_intersects_with_segment(segment)
  hits = []

  @mol_map.each do |mol_id, molecule|
    next unless segment.intersects_with_polygon?(molecule.polygon)

    hits << mol_id
  end

  hits
end
164
+ end
165
+ end
166
+ end
@@ -0,0 +1,173 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module ChemScanner
5
+ module Interpreter
6
+ Gem.find_files("chem_scanner/interpreter/*/*.rb").each { |f| require f }
7
+
8
+ using Extension
9
+
10
+ # General scheme, contains all graphics (molecules, text, arrows ...)
11
class Scheme
  attr_reader :mol_map, :text_map, :bracket_map, :reactions,
              :n_atoms, :fragment_as_line

  include PreProcess
  include ReactionDetection
  include PostProcess

  # Takes over the element maps produced by the parser and prepares the
  # empty containers that the interpretation steps fill in.
  #
  # parser - object exposing fragment_map, fragment_group_map, geometry_map,
  #          graphic_map, text_map and bracket_map.
  def initialize(parser)
    # Wrap every parsed fragment into an interpreter Fragment, keeping ids.
    @fragment_map = parser.fragment_map
                          .map { |id, node| [id, Fragment.new(node)] }
                          .to_h
    @fragment_group_map = parser.fragment_group_map

    @geometry_map = parser.geometry_map
    @graphic_map = parser.graphic_map

    @text_map = parser.text_map
    @bracket_map = parser.bracket_map

    @mol_map = ElementMap.new
    @mol_group_map = ElementMap.new

    @arrow_map = ElementMap.new
    # Segments, i.e. headless arrows
    @segment_map = ElementMap.new

    @mol_substitutes = {}
    @reaction_substitutes = {}

    @fragment_as_line = 0

    @reactions = []
  end

  # Runs the whole pipeline: pre-processing, reaction detection,
  # post-processing and the text-group interpretation, then registers the
  # molecules of every molecule group in @mol_map.
  def interpret
    pre_process
    reaction_detection
    post_process

    text_groups = TextGroupInterpreter.new(self)

    # Detect whether molecules carry n-atoms and keep that information
    text_groups.retrieve_n_atoms_info
    @n_atoms = text_groups.n_atoms

    # Retrieve R-groups and alias-groups of molecules
    text_groups.retrieve_alias_info

    # Currently disabled steps:
    # - Find R-groups ("R1", "R2", "R", ...)
    # - Find alias-groups ("X", "Y", "Ar", "M")
    # - Detect label set ("2a,b" "3-6" ...)
    # tgi.retrieve_labels_and_groups
    #
    # - Combine the additional info detected in molecule/reaction text,
    #   e.g., "3: m = 1, R = H"
    # - Interpret the previously retrieved data
    # - Save that info to generate molecules/reactions later
    # interpret_labels_and_groups

    # Try to generate new molecules/reactions
    # based on R-groups, alias-groups, n-atoms ...
    text_groups.generate_elements

    # Molecules that only exist inside a group become part of the scheme;
    # entries already present in @mol_map are kept.
    @mol_group_map.each_value do |mol_group|
      mol_group.molecules.each do |molecule|
        next if @mol_map.key?(molecule.id)

        @mol_map[molecule.id] = molecule
      end
    end
  end

  # All molecules currently registered in the scheme.
  def molecules
    @mol_map.values
  end

  private

  # First stage: clean up the raw graphics before reactions are detected.
  def pre_process
    # Retrieve fragments which are covered by a rectangle
    find_fragment_inside_rectangle

    # - Attach the fragments detected above to arrows
    # - Try to detect cross arrows ( --//--> or --X--> )
    #
    #   -----|
    #        |
    #        V
    # - Extend arrows if possible
    #
    #        |------>
    #        |
    #   -----|
    #        |
    #        |------>
    # - Split extended arrows if possible
    refine_arrow

    extract_fragment_graphic
    refine_molecules
  end

  # Second stage: build reactions out of molecules, arrows and text.
  def reaction_detection
    # Add molecules to reactions based on molecule and arrow positions
    assign_to_reaction

    # (1):  A ---> C
    #
    # (2):  B ---> D
    #       |
    #       |
    #       V
    #       E
    # Remove C from (2)
    #
    # Remove a molecule that is separated from the others in the same group:
    # if it is too far away, it is not considered part of the reaction
    remove_separated_mol

    # With the current algorithm, reagents could belong to multiple
    # reactions. Only take the nearest one
    refine_duplicate_reagents

    # Attach text to molecule or arrow
    # Process molecule label
    assign_text

    # Text can also be reactants/products.
    # Process these ONLY IF text does not belong to any reaction or molecule
    assign_molecule_group

    # NOTE: Handle some specific scenarios from here

    # A -> B ->
    # C -> D -> E
    # For this case, we will have an extra implicit reaction: B -> C
    # For now, only deal with this case if all arrows are horizontal
    multi_line_chain_reaction
  end

  # Third stage: resolve labels and text, then assemble the reactions.
  def post_process
    # Check if there is any label inside reagents
    # which is not assigned to any molecule
    refine_reagents_label

    # A label usually represents a molecule; process those in reagents text
    replace_label_by_molecule

    # Text-as-label, e.g. "ligand = ", "amide = "
    refine_text_label

    refine_text_as_molecule

    # From id => molecule
    assemble_reaction

    # - Extract reaction-related information: temperature, time, yield
    # - Try to interpret abbreviations
    @reactions.each do |reaction|
      process_reaction_info(reaction)
    end

    process_reactions_step
  end
end
172
+ end
173
+ end