chem_scanner 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +604 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/CODE_OF_CONDUCT.md +74 -0
  9. data/Gemfile +20 -0
  10. data/LICENSE.txt +661 -0
  11. data/README.md +177 -0
  12. data/Rakefile +8 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/chem_scanner.gemspec +43 -0
  16. data/lib/chem_scanner.rb +79 -0
  17. data/lib/chem_scanner/cdx.rb +67 -0
  18. data/lib/chem_scanner/cdxml.rb +72 -0
  19. data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
  20. data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
  21. data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
  22. data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
  23. data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
  24. data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
  25. data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
  26. data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
  27. data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
  28. data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
  29. data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
  30. data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
  31. data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
  32. data/lib/chem_scanner/chem_draw/parser.rb +214 -0
  33. data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
  34. data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
  35. data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
  36. data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
  37. data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
  38. data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
  39. data/lib/chem_scanner/configuration/superatom.rb +76 -0
  40. data/lib/chem_scanner/configuration/superatom.txt +2874 -0
  41. data/lib/chem_scanner/configuration/util.rb +40 -0
  42. data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
  43. data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
  44. data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
  45. data/lib/chem_scanner/doc.rb +56 -0
  46. data/lib/chem_scanner/docx.rb +86 -0
  47. data/lib/chem_scanner/export/cml.rb +176 -0
  48. data/lib/chem_scanner/extension/element_map.rb +9 -0
  49. data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
  50. data/lib/chem_scanner/extension/geometry/line.rb +123 -0
  51. data/lib/chem_scanner/extension/geometry/point.rb +18 -0
  52. data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
  53. data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
  54. data/lib/chem_scanner/extension/passthrough.rb +7 -0
  55. data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
  56. data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
  57. data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
  58. data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
  59. data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
  60. data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
  61. data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
  62. data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
  63. data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
  64. data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
  65. data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
  66. data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
  67. data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
  68. data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
  69. data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
  70. data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
  71. data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
  72. data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
  73. data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
  74. data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
  75. data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
  76. data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
  77. data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
  78. data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
  79. data/lib/chem_scanner/interpreter/scheme.rb +173 -0
  80. data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
  81. data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
  82. data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
  83. data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
  84. data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
  85. data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
  86. data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
  87. data/lib/chem_scanner/perkin_eln.rb +287 -0
  88. data/lib/chem_scanner/version.rb +5 -0
  89. data/lib/rubygems_plugin.rb +5 -0
  90. metadata +244 -0
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module ReactionDetection
8
# Runs step detection on every reaction held in @reactions.
#
# @return [Object] the @reactions collection (result of #each)
def process_reactions_step
  @reactions.each do |reaction|
    detect_reaction_step(reaction)
  end
end
11
+
12
# Splits a reaction description into numbered steps ("1) ...", "(ii) ...",
# "A. ...") and appends one ReactionStep per entry to reaction.steps.
#
# Nothing is changed unless at least two numbered entries are found and
# their numbering is an in-order, duplicate-free selection from exactly one
# of the supported schemes (digits, upper/lower roman, capital letters).
#
# When any step carries its own time / temperature, the reaction-level
# value is reset to "".
#
# @param reaction [#description, #reagent_abbs, #steps, #reagents]
#   reaction to analyse; modified in place
# @return [void]
def detect_reaction_step(reaction)
  number_ref = [
    %w[1 2 3 4 5 6 7 8 9],
    %w[I II III IV V VI VII VIII IX],
    %w[i ii iii iv v vi vii viii ix],
    %w[A B C D E F G H J],
  ]

  # Capture 3 is the numbering token, capture 4 the free text after it.
  regex_list = [
    /(^|\A)(([1-9a-z]{0,3}) *[)\.] *(.*))($|\z)/i,
    /(^|\A)\((([1-9a-z]{0,3}) *\) *(.*))($|\z)/i,
  ]

  check = false
  list_matched = []
  list_numbered = []
  regex_list.each do |regex|
    list_matched = reaction.description.enum_for(:scan, regex).map do
      Regexp.last_match
    end
    list_numbered = list_matched.map { |m| m[3] }
    next if list_numbered.empty?

    # `ref & list == list` holds only when the tokens appear in ref order
    # without repetition.
    check = number_ref.any? { |ref| ref & list_numbered == list_numbered }
    break if check
  end

  return unless check && list_numbered.count >= 2

  flatten_ref = number_ref.flatten
  any_time = false
  any_temperature = false

  list_matched.each_with_index do |matched, idx|
    start_pos = matched.begin(0)
    following = list_matched[idx + 1]
    # An entry runs up to the character before the next numbered entry,
    # or to the end of the description for the last one.
    end_pos = following ? following.begin(0) - 1 : -1
    entry = reaction.description[start_pos..end_pos]

    # Use the match offset of the free text instead of searching for it:
    # a text search would hit the numbering prefix for entries such as
    # "A. A". For an empty text this is the position right after the
    # numbering token and its separator.
    description = entry[(matched.begin(4) - start_pos)..-1]
    temperature, _, time = extract_reaction_info([description])

    step = ReactionStep.new
    step.temperature = temperature
    step.time = time
    step.description = description
    # Every scheme has nine entries, so the flattened index modulo 9 is
    # the zero-based step number.
    step.number = (flatten_ref.index(matched[3]) % 9) + 1

    # Accumulate across steps; a later step without time/temperature must
    # not hide that an earlier step had one.
    any_time ||= !time.empty?
    any_temperature ||= !temperature.empty?

    reaction.reagent_abbs.each do |abb|
      next unless description.include?(abb)

      step.reagents.push(ChemScanner.get_abbreviation(abb))
    end

    reaction.steps.push(step)
  end

  reaction.time = "" if any_time
  reaction.temperature = "" if any_temperature

  # NOTE: temporary heuristic - when the reaction has exactly one reagent
  # and exactly one step without text, assign that reagent to the step.
  return if reaction.reagents.count != 1

  empty_steps = reaction.steps.select do |s|
    s.description.empty? || s.description == "\n"
  end
  return if empty_steps.count != 1

  empty_steps.first.reagents.push(reaction.reagents.first.cano_smiles)
end
93
+ end
94
+ end
95
+ end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module PostProcess
8
# For each reaction, moves bold arrow texts onto the nearest reagent when no
# reagent of that reaction already has the bold text as its label.
#
# The moved text id is removed from the reaction's text_ids and from the
# arrow's text_arr, appended to the reagent's text_ids, and the reagent is
# passed to assemble_molecule_text.
#
# @return [Object] the @reactions collection (result of #each)
def refine_reagents_label
  @reactions.each do |r|
    pending = []

    @arrow_map[r.arrow_id].text_arr.each do |tid|
      text = @text_map[tid]
      bold = text.bold_text
      next if bold.strip.empty?

      labelled_id = r.reagent_ids.find { |id| @mol_map[id].label == bold }
      next unless labelled_id.nil?

      # Linear scan for the closest reagent; id 0 means "none found".
      nearest_id = 0
      nearest_dist = 9_999_999
      r.reagent_ids.each do |rid|
        dist = @mol_map[rid].min_distance_to_point(text.polygon.center)
        next unless dist < nearest_dist

        nearest_id = rid
        nearest_dist = dist
      end

      pending.push(text: tid, reagent: nearest_id) if nearest_id.positive?
    end

    # Applied after the scan so text_arr is not mutated while iterated.
    pending.each do |entry|
      text = @text_map[entry[:text]]
      r.text_ids.delete(text.id)
      @arrow_map[r.arrow_id].text_arr.delete(text.id)

      reagent = @mol_map[entry[:reagent]]
      reagent.text_ids.push(text.id)
      assemble_molecule_text(reagent)
    end
  end
end
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,52 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module ChemScanner
5
+ # Interpreter of extracted/scanned information
6
+ module Interpreter
7
+ using Extension
8
+
9
+ module PostProcess
10
# Promotes texts that are attached to a molecule but are themselves a known
# abbreviation into molecules of their own.
#
# A text is promoted only when ChemScanner.get_abbreviation returns a
# non-empty value for it, detect_position places it in no reaction's
# "reagents" group, and in at least one reaction's "reactants" or
# "products" group. The new molecule is stored in @mol_map under the text
# key, added to that reaction's reactant/product ids, and the text is
# dropped from @text_map and from its former molecule.
#
# @return [Array] keys of the texts that were promoted
def refine_text_as_molecule
  key_to_delete = []

  @text_map.each do |k, text|
    mol = @mol_map.values.detect { |m| m.text_ids.include?(k) }
    next if mol.nil?

    smi = ChemScanner.get_abbreviation(text.value)
    next if smi.empty?

    # arrow id => group name reported for this text
    group_pos = {}
    @reactions.each do |reaction|
      rid = reaction.arrow_id
      group = detect_position(@arrow_map[rid], text.polygon)
      next if group.nil?

      group_pos[rid] = group
    end

    next if group_pos.any? { |_, p| p == "reagents" }

    pos = group_pos.detect { |_, p| %w[reactants products].include?(p) }
    next if pos.nil?

    key_to_delete.push(k)
    mol.text_ids.delete(k)
    @mol_map[k] = Molecule.new_from_smiles(k, smi)

    # Use the entry validated above rather than the first entry of the
    # hash, so the derived accessor name is always reactant_ids/product_ids.
    arrow_id, group = pos
    reaction = @reactions.detect { |r| r.arrow_id == arrow_id }
    # "reactants" -> "reactant_ids", "products" -> "product_ids"
    reaction.send("#{group[0...-1]}_ids").push(k)
  end

  # Removed afterwards so @text_map is not modified while being iterated.
  key_to_delete.each { |k| @text_map.delete(k) }
end
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,40 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module ChemScanner
5
+ # Interpreter of extracted/scanned information
6
+ module Interpreter
7
+ using Extension
8
+
9
+ module PostProcess
10
+ # text_id could be both on text_map and mol_group_map
11
+ # Text-as-label, e.g. "ligand = ", "amide = "
12
# Handles molecules whose text ends with "=" (text-as-label such as
# "ligand = ").
#
# Each such molecule is added to the reagent ids of every reaction. If the
# label text (without the trailing "=") occurs in any arrow text of any
# reaction, the molecule is additionally removed from all reactant and
# product id lists.
#
# NOTE(review): the molecule is added as reagent to every reaction even
# when the label was not found in that reaction's arrow texts - confirm
# this is intended.
#
# @return [Hash] the molecules selected as labels
def refine_text_label
  label_mols = @mol_map.select { |_, m| m.text.strip.end_with?("=") }

  label_mols.each do |mid, mol|
    label_text = mol.text.strip.chomp("=").strip
    existed = false

    @reactions.each do |r|
      @arrow_map[r.arrow_id].text_arr.each do |tid|
        existed = true if @text_map[tid].value.include?(label_text)
      end

      r.reagent_ids.push(mid) unless r.reagent_ids.include?(mid)
    end

    next unless existed

    @reactions.each do |r|
      %i[reactant_ids product_ids].each do |accessor|
        r.send(accessor).delete(mid)
      end
    end
  end
end
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,197 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ ESTIMATED_DIST = 0.2
8
+
9
+ module PreProcess
10
+ # - Detect cross arrow from line map
11
+ # - Attach "extend" line to arrow
12
# Builds @arrow_map and @segment_map from the raw geometry/graphic maps.
#
# Steps, in order:
#   1. detect_line_fragment
#   2. headless geometries and line graphics are moved into @segment_map
#   3. an Arrow is created for every remaining geometry and every graphic
#      that has both head and tail; an arrow whose head ends (within
#      ESTIMATED_DIST) on another arrow takes over that arrow's head
#   4. try_check_cross, try_extend_tail, try_extend_split
#
# @return [Object] whatever try_extend_split returns
def refine_arrow
  detect_line_fragment

  # Turns anything exposing tail/head coordinate hashes into a Segment.
  build_segment = lambda do |shape|
    tail = Geometry::Point.new(shape.tail[:x], shape.tail[:y])
    head = Geometry::Point.new(shape.head[:x], shape.head[:y])
    Geometry::Segment.new(tail, head)
  end

  # Headless arrow ~ line, part of the real arrow
  @geometry_map.select { |_, g| g.headless }.each_key do |k|
    @segment_map[k] = build_segment.call(@geometry_map.delete(k))
  end

  @graphic_map.select { |_, g| g.line? }.each_key do |k|
    @segment_map[k] = build_segment.call(@graphic_map.delete(k))
  end

  #      |
  # ---->|
  #      |
  #      V
  arrow_graphic = @graphic_map.select do |_, g|
    !g.head.nil? && !g.tail.nil?
  end
  all_arrow = @geometry_map.merge(arrow_graphic)

  all_arrow.each do |key, geometry|
    arrow = Arrow.new(geometry)
    @arrow_map[key] = arrow
    line = geometry.segment.to_line

    all_arrow.except(key).each_value do |other|
      oseg = other.segment
      next unless line.intersects_with_segment?(oseg)

      point = line.intersection_points_with(oseg.to_line)
      next unless oseg.contains_point?(point)

      #     |
      #     |
      # ----|->
      #     |
      #     |
      #     v
      # NOTE: because schemes are drawn by hand, the intersection point
      # may not be exactly the head of the arrow
      next if Geometry.distance(arrow.head, point) > ESTIMATED_DIST

      # The arrow runs into another geometry: continue to that one's head
      arrow.change_head(other.head)
    end
  end

  #   \
  # --\-->
  #    \
  # Same effect as the "nogo" attribute
  try_check_cross

  # -----|
  #      |
  #      V
  try_extend_tail

  #      |------>
  #      |
  # -----|
  #      |
  #      |------>
  try_extend_split
end
86
+
87
+ # - Check text within mol
88
+ # - Detect any "arrow" molecules ( straight C bonds: ----- )
89
+ # which people draw so that they are read as an arrow
90
# Converts fragments that are really straight lines (fragment.line?) into
# segments: each one is stored in @segment_map under the fragment key,
# counted in @fragment_as_line and removed from @fragment_map.
#
# The segment runs between the two extreme nodes of the fragment, ordered
# by y for a vertical line (all nodes share one x) and by x otherwise.
#
# @return [Array] keys of the fragments that were converted
def detect_line_fragment
  remove_keys = []

  @fragment_map.each do |key, fragment|
    # Check if the user drew a molecule as an "extended" arrow
    next unless fragment.line?

    remove_keys.push(key)
    @fragment_as_line += 1

    nodes = fragment.node_map.values
    # A line is vertical when every node has the same x. Testing y here
    # would make the sort key constant for both vertical and horizontal
    # lines and yield arbitrary end points.
    is_vertical = nodes.map(&:x).uniq.count == 1
    sorted_atoms = nodes.sort_by { |atom| is_vertical ? atom.y : atom.x }

    @segment_map[key] = Geometry::Segment.new(
      sorted_atoms.first, sorted_atoms.last
    )
  end

  # Removed afterwards so @fragment_map is not modified while iterated.
  remove_keys.each { |k| @fragment_map.delete(k) }
end
110
+
111
+ # Try to extend base arrow if possible
112
# Extends arrows backwards along loose segments.
#
# For every segment/arrow pair, when the segment end nearest to the arrow's
# tail is within ESTIMATED_DIST of it, the opposite end of the segment is
# recorded as the arrow's new tail (a later matching segment overrides an
# earlier one). Afterwards each recorded segment is removed from
# @segment_map and the arrow is updated through change_tail.
#
# @return [Hash] arrow id => { skey:, point: } for every extended arrow
def try_extend_tail
  extensions = {}

  @segment_map.each do |skey, seg|
    @arrow_map.each_value do |arrow|
      d1 = Geometry.distance(seg.point1, arrow.tail)
      d2 = Geometry.distance(seg.point2, arrow.tail)
      # nearest end decides the distance, the far end is the new tail
      nearest, far_end = d1 <= d2 ? [d1, seg.point2] : [d2, seg.point1]
      next if nearest > ESTIMATED_DIST

      extensions[arrow.id] = { skey: skey, point: far_end }
    end
  end

  extensions.each do |arrow_id, info|
    @segment_map.delete(info[:skey])
    @arrow_map[arrow_id].change_tail(info[:point])
  end
end
138
+
139
# Attaches a loose segment to an arrow whose tail segment it meets at one
# of the loose segment's ends (a shared stem feeding a split arrow).
#
# For every segment/arrow pair whose supporting line intersects the
# arrow's tail segment, the pair is recorded when the intersection lies
# within ESTIMATED_DIST of one of the segment's end points. The arrow then
# gets update_tail(intersection) followed by change_tail(far end of the
# segment), and the consumed segment is removed from @segment_map.
#
# @return [Hash] arrow id => { key:, point:, tpoint: } for updated arrows
def try_extend_split
  arrow_new_split = {}

  @segment_map.each do |key, segment|
    line = segment.to_line

    @arrow_map.each_value do |arrow|
      asegment = arrow.tail_segment
      next unless line.intersects_with_segment?(asegment)

      point = line.intersection_points_with(asegment.to_line)
      dist1 = Geometry.distance(segment.point1, point)
      dist2 = Geometry.distance(segment.point2, point)
      next if [dist1, dist2].min > ESTIMATED_DIST

      # The end away from the intersection becomes the new tail.
      tail_point = dist1 < dist2 ? segment.point2 : segment.point1
      arrow_new_split[arrow.id] = {
        key: key,
        point: point,
        tpoint: tail_point,
      }
    end
  end

  arrow_new_split.each do |aid, split_info|
    arrow = @arrow_map[aid]
    arrow.update_tail(split_info[:point])
    arrow.change_tail(split_info[:tpoint])

    # The segment key is stored under :key; looking it up under any other
    # name yields nil and would leave the consumed segment in the map.
    @segment_map.delete(split_info[:key])
  end
end
171
+
172
# Detects loose segments that cross an arrow and attaches them to it.
#
# Arrows already flagged as cross are skipped. For the others, every
# segment in @segment_map is tested against each of the arrow's segments:
# the two must intersect, the intersection must lie on the arrow segment,
# and seg.point_in_range(intersection, 3.0 / 5.0) must hold (presumably
# "within the central 3/5 of the loose segment" - confirm against
# Segment#point_in_range). Matching segments are handed to
# arrow.add_cross_segment and then removed from @segment_map.
#
# @return [Object] @arrow_map (result of #each_value)
def try_check_cross
  @arrow_map.each_value do |arrow|
    next if arrow.cross

    consumed = []
    @segment_map.each do |skey, seg|
      arrow.segments.each do |aseg|
        next unless seg.intersects_with?(aseg)

        crossing = seg.intersection_point_with(aseg)
        next unless aseg.contains_point?(crossing)
        next unless seg.point_in_range(crossing, 3.0 / 5.0)

        # Add to the "polyline" of the arrow
        arrow.add_cross_segment(seg)
        consumed.push(skey)
      end
    end

    consumed.each { |skey| @segment_map.delete(skey) }
  end
end
195
+ end
196
+ end
197
+ end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
+ module PreProcess
8
# Marks fragments that are enclosed by a small rectangle graphic as boxed.
#
# Only graphics with type 3 (rectangle) and a bounding-box area below 100
# are considered. A fragment is boxed when the rectangle's polygon contains
# the fragment's polygon. For fragment groups holding exactly one fragment,
# that fragment is boxed when the rectangle contains the group title's
# polygon.
#
# @return [Hash] the rectangle graphics that were examined
def find_fragment_inside_rectangle
  # 3 = Rectangle
  small_rectangles = @graphic_map.select do |_, g|
    g.type == 3 && g.bounding_box.area < 100
  end

  small_rectangles.each_value do |graphic|
    @fragment_map.each_value do |fragment|
      inside = graphic.polygon.contains_polygon?(fragment.polygon)
      fragment.boxed = true if inside
    end

    @fragment_group_map.each_value do |fgroup|
      fmap = fgroup[:fragment_map]
      next unless fmap.values.count == 1

      title = fgroup[:title]
      next unless graphic.polygon.contains_polygon?(title.polygon)

      fmap.values.first.boxed = true
    end
  end
end
31
+
32
# Copies the graphics nested inside each fragment into @graphic_map
# (entries with the same key are overwritten by the fragment's graphic).
#
# @return [Object] @fragment_map (result of #each_value)
def extract_fragment_graphic
  @fragment_map.each_value do |fragment|
    nested = fragment.graphic_map
    @graphic_map.merge!(nested) unless nested.empty?
  end
end
39
+ end
40
+ end
41
+ end