chem_scanner 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +604 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/CODE_OF_CONDUCT.md +74 -0
  9. data/Gemfile +20 -0
  10. data/LICENSE.txt +661 -0
  11. data/README.md +177 -0
  12. data/Rakefile +8 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/chem_scanner.gemspec +43 -0
  16. data/lib/chem_scanner.rb +79 -0
  17. data/lib/chem_scanner/cdx.rb +67 -0
  18. data/lib/chem_scanner/cdxml.rb +72 -0
  19. data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
  20. data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
  21. data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
  22. data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
  23. data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
  24. data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
  25. data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
  26. data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
  27. data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
  28. data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
  29. data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
  30. data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
  31. data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
  32. data/lib/chem_scanner/chem_draw/parser.rb +214 -0
  33. data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
  34. data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
  35. data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
  36. data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
  37. data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
  38. data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
  39. data/lib/chem_scanner/configuration/superatom.rb +76 -0
  40. data/lib/chem_scanner/configuration/superatom.txt +2874 -0
  41. data/lib/chem_scanner/configuration/util.rb +40 -0
  42. data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
  43. data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
  44. data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
  45. data/lib/chem_scanner/doc.rb +56 -0
  46. data/lib/chem_scanner/docx.rb +86 -0
  47. data/lib/chem_scanner/export/cml.rb +176 -0
  48. data/lib/chem_scanner/extension/element_map.rb +9 -0
  49. data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
  50. data/lib/chem_scanner/extension/geometry/line.rb +123 -0
  51. data/lib/chem_scanner/extension/geometry/point.rb +18 -0
  52. data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
  53. data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
  54. data/lib/chem_scanner/extension/passthrough.rb +7 -0
  55. data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
  56. data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
  57. data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
  58. data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
  59. data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
  60. data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
  61. data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
  62. data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
  63. data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
  64. data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
  65. data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
  66. data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
  67. data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
  68. data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
  69. data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
  70. data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
  71. data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
  72. data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
  73. data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
  74. data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
  75. data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
  76. data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
  77. data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
  78. data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
  79. data/lib/chem_scanner/interpreter/scheme.rb +173 -0
  80. data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
  81. data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
  82. data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
  83. data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
  84. data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
  85. data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
  86. data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
  87. data/lib/chem_scanner/perkin_eln.rb +287 -0
  88. data/lib/chem_scanner/version.rb +5 -0
  89. data/lib/rubygems_plugin.rb +5 -0
  90. metadata +244 -0
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module ReactionDetection
8
# Runs step detection on each reaction held in @reactions.
def process_reactions_step
  @reactions.each do |reaction|
    detect_reaction_step(reaction)
  end
end
11
+
12
# Splits an enumerated reaction description into ReactionStep objects.
#
# The description is scanned line by line for "N) text" / "N. text" markers
# and, when those give no usable numbering, for "(N) text" markers. The
# markers must all come from a single numbering scheme (1-9, I-IX, i-ix or
# A-H/J), appear in that scheme's order without repeats, and there must be
# at least two of them; otherwise the reaction is left untouched.
#
# Each marker produces one step pushed onto reaction.steps, carrying its
# number, its text, the temperature/time returned by extract_reaction_info,
# and the abbreviations from reaction.reagent_abbs that occur in its text.
# When any step carries its own time (or temperature), the reaction-level
# value is reset to an empty string.
#
# @param reaction the reaction to inspect; it is mutated in place
# @return [void]
def detect_reaction_step(reaction)
  number_ref = [
    %w[1 2 3 4 5 6 7 8 9],
    %w[I II III IV V VI VII VIII IX],
    %w[i ii iii iv v vi vii viii ix],
    %w[A B C D E F G H J],
  ]

  regex_list = [
    /(^|\A)(([1-9a-z]{0,3}) *[)\.] *(.*))($|\z)/i,
    /(^|\A)\((([1-9a-z]{0,3}) *\) *(.*))($|\z)/i,
  ]

  check = false
  list_matched = []
  list_numbered = []

  # Stop at the first pattern whose markers all belong to one numbering scheme.
  regex_list.each do |regex|
    list_matched = reaction.description.enum_for(:scan, regex).map do
      Regexp.last_match
    end
    list_numbered = list_matched.map { |m| m[3] }
    next if list_numbered.empty?

    # Array#& keeps the scheme's order and drops repeats, so equality only
    # holds for an ordered, duplicate-free subset of that scheme.
    check = number_ref.any? { |ref| (ref & list_numbered) == list_numbered }
    break if check
  end

  return unless check && list_numbered.count >= 2

  flatten_ref = number_ref.flatten
  list_position = list_matched.map { |m| m.begin(0) }
  check_time = false
  check_temperature = false

  list_matched.each_with_index do |matched, idx|
    # A step's text runs up to the character before the next marker,
    # or to the end of the description for the last step.
    following = list_position[idx + 1]
    last_pos = following.nil? ? -1 : following - 1
    description = reaction.description[list_position[idx]..last_pos]

    text_start_pos = if matched[4].empty?
                       m2 = matched[2]
                       description.index(m2) + m2.size
                     else
                       description.index(matched[4]) || 0
                     end
    description = description[text_start_pos..-1]
    temperature, _, time = extract_reaction_info([description])

    step = ReactionStep.new
    step.temperature = temperature
    step.time = time
    step.description = description
    # Every scheme has nine entries, so the flattened index modulo 9
    # gives the zero-based position inside its scheme.
    step.number = (flatten_ref.index(matched[3]) % 9) + 1

    # Accumulate over all steps; a later step without time/temperature
    # must not undo what an earlier step established.
    check_time ||= !time.empty?
    check_temperature ||= !temperature.empty?

    reaction.reagent_abbs.each do |abb|
      next unless description.include?(abb)

      step.reagents.push(ChemScanner.get_abbreviation(abb))
    end

    reaction.steps.push(step)
  end

  reaction.time = "" if check_time
  reaction.temperature = "" if check_temperature

  # NOTE: temporary workaround - assign the only reagent to the only empty step
  return if reaction.reagents.count != 1

  empty_steps = reaction.steps.select do |s|
    s.description.empty? || s.description == "\n"
  end
  return if empty_steps.count != 1

  empty_steps.first.reagents.push(reaction.reagents.first.cano_smiles)
end
93
+ end
94
+ end
95
+ end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module PostProcess
8
# Moves bold arrow texts that do not match any reagent label onto the
# closest reagent molecule of the same reaction.
#
# For every reaction, each text attached to its arrow is inspected; a
# non-blank bold part that equals no reagent's label is paired with the
# reagent whose min_distance_to_point to the text polygon centre is
# smallest. The paired text is then removed from the reaction and the
# arrow, appended to the reagent's text_ids, and the reagent text is
# rebuilt through assemble_molecule_text.
def refine_reagents_label
  @reactions.each do |reaction|
    pending = []

    @arrow_map[reaction.arrow_id].text_arr.each do |tid|
      text = @text_map[tid]
      bold = text.bold_text
      next if bold.strip.empty?

      labelled_id = reaction.reagent_ids.find { |id| @mol_map[id].label == bold }
      next unless labelled_id.nil?

      # Sentinel start values: id 0 means "nothing found yet".
      nearest_id = 0
      nearest_dist = 9_999_999
      reaction.reagent_ids.each do |rid|
        dist = @mol_map[rid].min_distance_to_point(text.polygon.center)
        next unless dist < nearest_dist

        nearest_id = rid
        nearest_dist = dist
      end

      pending.push(text: tid, reagent: nearest_id) if nearest_id.positive?
    end

    # Applied after the scan so text_arr is not modified while iterated.
    pending.each do |pair|
      text = @text_map[pair[:text]]
      reaction.text_ids.delete(text.id)
      @arrow_map[reaction.arrow_id].text_arr.delete(text.id)

      reagent = @mol_map[pair[:reagent]]
      reagent.text_ids.push(text.id)
      assemble_molecule_text(reagent)
      # reagent.label = text.bold_text.strip
      # text.remove_bold
    end
  end
end
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,52 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module ChemScanner
5
+ # Interpreter of extracted/scanned information
6
+ module Interpreter
7
+ using Extension
8
+
9
+ module PostProcess
10
# Promotes molecule-attached texts that are known abbreviations to
# molecules of their own.
#
# A text qualifies when it is listed in some molecule's text_ids,
# ChemScanner.get_abbreviation returns a non-empty SMILES for its value,
# detect_position places it in no reaction's "reagents" group, and it lies
# in the "reactants" or "products" group of at least one reaction. Such a
# text is detached from its molecule, a Molecule built from the SMILES is
# stored in @mol_map under the text's key, that key is appended to the
# matching reactant_ids / product_ids list, and the text is removed from
# @text_map.
def refine_text_as_molecule
  key_to_delete = []

  @text_map.each do |k, text|
    mol = @mol_map.values.detect { |m| m.text_ids.include?(k) }
    next if mol.nil?

    smi = ChemScanner.get_abbreviation(text.value)
    next if smi.empty?

    # arrow id => group name, for every reaction that positions this text
    group_pos = {}
    @reactions.each do |reaction|
      rid = reaction.arrow_id
      group = detect_position(@arrow_map[rid], text.polygon)
      next if group.nil?

      group_pos[rid] = group
    end

    next if group_pos.any? { |_, p| p == "reagents" }

    pos = group_pos.detect { |_, p| %w[reactants products].include?(p) }
    next if pos.nil?

    key_to_delete.push(k)
    mol.text_ids.delete(k)
    @mol_map[k] = Molecule.new_from_smiles(k, smi)

    # Use the validated pair so the accessor below is always
    # reactant_ids or product_ids ("reactants" -> "reactant_ids").
    arrow_id, group_name = pos
    reaction = @reactions.detect { |r| r.arrow_id == arrow_id }
    group_ids = reaction.send("#{group_name[0...-1]}_ids")
    group_ids.push(k)
  end

  # Removed after the loop so @text_map is not modified while iterated
  key_to_delete.each { |k| @text_map.delete(k) }
end
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,40 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module ChemScanner
5
+ # Interpreter of extracted/scanned information
6
+ module Interpreter
7
+ using Extension
8
+
9
+ module PostProcess
10
+ # text_id could be both on text_map and mol_group_map
11
+ # Text-as-label, e.g. "ligand = ", "amide = "
12
# Handles molecules whose text ends with "=" (text used as a label).
#
# The label is the molecule text without the trailing "=" and surrounding
# whitespace. Each such molecule is appended to the reagent_ids of every
# reaction that does not already list it. If the label occurs in the value
# of any text attached to any reaction arrow, the molecule is additionally
# removed from all reactant_ids and product_ids lists.
#
# NOTE(review): the molecule is added to reagent_ids of every reaction,
# whether or not that reaction's arrow texts mention the label - confirm
# this is intended.
def refine_text_label
  label_mols = @mol_map.select { |_, m| m.text.strip[-1] == "=" }

  label_mols.each do |mid, mol|
    label_text = mol.text.strip.chomp("=").strip
    existed = false

    @reactions.each do |r|
      @arrow_map[r.arrow_id].text_arr.each do |tid|
        existed = true if @text_map[tid].value.include?(label_text)
      end

      r.reagent_ids.push(mid) unless r.reagent_ids.include?(mid)
    end

    next unless existed

    @reactions.each do |r|
      %w[reactant product].each do |group|
        ids = r.send("#{group}_ids")
        ids.delete(mid) if ids.include?(mid)
      end
    end
  end
end
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,197 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ ESTIMATED_DIST = 0.2
8
+
9
+ module PreProcess
10
+ # - Detect cross arrow from line map
11
+ # - Attach "extend" line to arrow
12
# Builds @arrow_map and @segment_map from the raw geometry/graphic maps.
#
# Steps, in order:
# 1. detect_line_fragment is run first.
# 2. Headless entries of @geometry_map and line entries of @graphic_map are
#    removed from their maps and stored in @segment_map as
#    Geometry::Segment (tail -> head).
# 3. Every remaining geometry, plus every graphic having both head and
#    tail, becomes an Arrow in @arrow_map. When the arrow's line meets
#    another arrow's segment within ESTIMATED_DIST of this arrow's head,
#    this arrow's head is changed to the other arrow's head.
# 4. try_check_cross, try_extend_tail and try_extend_split are applied.
def refine_arrow
  detect_line_fragment

  # Converts a shape exposing tail/head hashes (:x, :y) into a Segment.
  to_segment = lambda do |shape|
    tail = Geometry::Point.new(shape.tail[:x], shape.tail[:y])
    head = Geometry::Point.new(shape.head[:x], shape.head[:y])
    Geometry::Segment.new(tail, head)
  end

  # Headless arrow ~ line, part of the real arrow
  @geometry_map.select { |_, g| g.headless }.each_key do |k|
    @segment_map[k] = to_segment.call(@geometry_map.delete(k))
  end

  @graphic_map.select { |_, g| g.line? }.each_key do |k|
    @segment_map[k] = to_segment.call(@graphic_map.delete(k))
  end

  #      |
  # ---->|
  #      |
  #      V
  arrow_graphic = @graphic_map.reject { |_, g| g.head.nil? || g.tail.nil? }
  all_arrow = @geometry_map.merge(arrow_graphic)
  all_arrow.each do |key, geometry|
    arrow = Arrow.new(geometry)
    @arrow_map[key] = arrow
    line = geometry.segment.to_line

    all_arrow.each do |other_key, other|
      # Compare against every other arrow, never against itself
      next if other_key == key

      oseg = other.segment
      next unless line.intersects_with_segment?(oseg)

      point = line.intersection_points_with(oseg.to_line)
      next unless oseg.contains_point?(point)

      #     |
      #     |
      # ----|->
      #     |
      #     |
      #     v
      # NOTE: due to manual drawing, the intersection point
      # may not be exactly at the head of the arrow
      next if Geometry.distance(arrow.head, point) > ESTIMATED_DIST

      # If it intersects with any other geometry
      arrow.change_head(other.head)
    end
  end

  #   \
  # --\-->
  #    \
  # Same effect as "nogo" attribute
  try_check_cross

  # -----|
  #      |
  #      V
  try_extend_tail

  #      |------>
  #      |
  # -----|
  #      |
  #      |------>
  try_extend_split
end
86
+
87
+ # - Check text within mol
88
+ # - Detect if there are any "arrow" molecule, ( straight C bonds: ----- )
89
+ # which people drawing to be viewed as an arrow
90
# Converts fragments that are really drawn lines into segments.
#
# Every fragment for which fragment.line? is true is counted in
# @fragment_as_line, replaced by a Geometry::Segment in @segment_map (same
# key) spanning its two extreme nodes, and removed from @fragment_map.
#
# The extreme nodes are found by ordering the nodes along the axis with the
# larger extent, so horizontal, vertical and slanted lines all yield their
# real end points. Ordering along an axis on which every node has the same
# coordinate would leave the end points to chance.
def detect_line_fragment
  remove_keys = []

  @fragment_map.each do |key, fragment|
    # Check if user draw a molecule as an "extended" arrow
    next unless fragment.line?

    remove_keys.push(key)
    @fragment_as_line += 1

    nodes = fragment.node_map.values
    x_min, x_max = nodes.map(&:x).minmax
    y_min, y_max = nodes.map(&:y).minmax
    # Prefer x on ties; the empty check keeps nil extents out of the maths.
    along_x = nodes.empty? || (x_max - x_min) >= (y_max - y_min)
    sorted_atoms = nodes.sort_by { |atom| along_x ? atom.x : atom.y }

    @segment_map[key] = Geometry::Segment.new(sorted_atoms.first, sorted_atoms.last)
  end

  # Deleted afterwards so @fragment_map is not modified while iterated
  remove_keys.each { |k| @fragment_map.delete(k) }
end
110
+
111
+ # Try to extend base arrow if possible
112
# Lengthens arrows backwards along a segment that ends at their tail.
#
# A segment qualifies for an arrow when one of its end points lies within
# ESTIMATED_DIST of the arrow's tail; the segment's other end point then
# becomes the arrow's new tail and the segment is removed from
# @segment_map. If several segments qualify for one arrow, the one visited
# last wins.
def try_extend_tail
  new_tails = {}

  @segment_map.each do |skey, seg|
    @arrow_map.each_value do |arrow|
      d1 = Geometry.distance(seg.point1, arrow.tail)
      d2 = Geometry.distance(seg.point2, arrow.tail)
      # The end point farther from the tail is the candidate new tail.
      nearest, far_point = d1 <= d2 ? [d1, seg.point2] : [d2, seg.point1]
      next if nearest > ESTIMATED_DIST

      new_tails[arrow.id] = { skey: skey, point: far_point }
    end
  end

  new_tails.each do |aid, info|
    @segment_map.delete(info[:skey])
    @arrow_map[aid].change_tail(info[:point])
  end
end
138
+
139
# Attaches a feeding segment to arrows whose tail segment it runs into.
#
# A segment qualifies for an arrow when its line meets the arrow's
# tail_segment and one of the segment's end points lies within
# ESTIMATED_DIST of the meeting point. The arrow then gets update_tail with
# the meeting point followed by change_tail with the segment's other end
# point, and the segment is removed from @segment_map. If several segments
# qualify for one arrow, the one visited last wins.
def try_extend_split
  arrow_new_split = {}

  @segment_map.each do |key, segment|
    line = segment.to_line

    @arrow_map.each_value do |arrow|
      asegment = arrow.tail_segment
      next unless line.intersects_with_segment?(asegment)

      point = line.intersection_points_with(asegment.to_line)
      dist1 = Geometry.distance(segment.point1, point)
      dist2 = Geometry.distance(segment.point2, point)
      next if [dist1, dist2].min > ESTIMATED_DIST

      # The end point away from the meeting point becomes the new tail
      tail_point = dist1 < dist2 ? segment.point2 : segment.point1
      arrow_new_split[arrow.id] = {
        skey: key,
        point: point,
        tpoint: tail_point,
      }
    end
  end

  arrow_new_split.each do |aid, split_info|
    arrow = @arrow_map[aid]
    arrow.update_tail(split_info[:point])
    arrow.change_tail(split_info[:tpoint])

    # Written and read under the same :skey so the consumed segment is
    # really removed.
    @segment_map.delete(split_info[:skey])
  end
end
171
+
172
# Attaches crossing strokes to arrows that are not yet marked as crossed.
#
# For each arrow with a falsy cross value, every segment of @segment_map is
# tested against every segment of the arrow. A stroke is accepted when the
# two intersect, the intersection lies on the arrow segment, and
# point_in_range(intersection, 3.0 / 5.0) holds for the stroke. Accepted
# strokes are handed to arrow.add_cross_segment and afterwards removed
# from @segment_map.
def try_check_cross
  @arrow_map.each_value do |arrow|
    next if arrow.cross

    crossed_keys = []

    @segment_map.each do |skey, stroke|
      arrow.segments.each do |arrow_part|
        next unless stroke.intersects_with?(arrow_part)

        hit = stroke.intersection_point_with(arrow_part)
        next unless arrow_part.contains_point?(hit) && stroke.point_in_range(hit, 3.0 / 5.0)

        # Add to the "polyline" of arrow
        arrow.add_cross_segment(stroke)
        crossed_keys << skey
      end
    end

    crossed_keys.each { |skey| @segment_map.delete(skey) }
  end
end
195
+ end
196
+ end
197
+ end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module PreProcess
8
# Flags fragments that are drawn inside a small rectangle as boxed.
#
# Only graphics of type 3 whose bounding box area is below 100 are
# considered. A fragment is flagged when the rectangle's polygon contains
# the fragment's polygon. For fragment groups holding exactly one
# fragment, that fragment is flagged when the rectangle contains the
# polygon of the group's :title.
def find_fragment_inside_rectangle
  # 3 = Rectangle
  small_rectangles = @graphic_map.select do |_, g|
    g.type == 3 && g.bounding_box.area < 100
  end

  small_rectangles.each_value do |rect|
    @fragment_map.each_value do |fragment|
      fragment.boxed = true if rect.polygon.contains_polygon?(fragment.polygon)
    end

    @fragment_group_map.each_value do |fgroup|
      members = fgroup[:fragment_map].values
      next unless members.count == 1
      next unless rect.polygon.contains_polygon?(fgroup[:title].polygon)

      members.first.boxed = true
    end
  end
end
31
+
32
# Merges the graphics carried by each fragment into @graphic_map.
# Fragments with an empty graphic_map are skipped.
def extract_fragment_graphic
  @fragment_map.each_value do |fragment|
    nested = fragment.graphic_map
    @graphic_map.merge!(nested) unless nested.empty?
  end
end
39
+ end
40
+ end
41
+ end