chem_scanner 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +604 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/CODE_OF_CONDUCT.md +74 -0
  9. data/Gemfile +20 -0
  10. data/LICENSE.txt +661 -0
  11. data/README.md +177 -0
  12. data/Rakefile +8 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/chem_scanner.gemspec +43 -0
  16. data/lib/chem_scanner.rb +79 -0
  17. data/lib/chem_scanner/cdx.rb +67 -0
  18. data/lib/chem_scanner/cdxml.rb +72 -0
  19. data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
  20. data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
  21. data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
  22. data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
  23. data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
  24. data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
  25. data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
  26. data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
  27. data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
  28. data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
  29. data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
  30. data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
  31. data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
  32. data/lib/chem_scanner/chem_draw/parser.rb +214 -0
  33. data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
  34. data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
  35. data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
  36. data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
  37. data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
  38. data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
  39. data/lib/chem_scanner/configuration/superatom.rb +76 -0
  40. data/lib/chem_scanner/configuration/superatom.txt +2874 -0
  41. data/lib/chem_scanner/configuration/util.rb +40 -0
  42. data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
  43. data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
  44. data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
  45. data/lib/chem_scanner/doc.rb +56 -0
  46. data/lib/chem_scanner/docx.rb +86 -0
  47. data/lib/chem_scanner/export/cml.rb +176 -0
  48. data/lib/chem_scanner/extension/element_map.rb +9 -0
  49. data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
  50. data/lib/chem_scanner/extension/geometry/line.rb +123 -0
  51. data/lib/chem_scanner/extension/geometry/point.rb +18 -0
  52. data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
  53. data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
  54. data/lib/chem_scanner/extension/passthrough.rb +7 -0
  55. data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
  56. data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
  57. data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
  58. data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
  59. data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
  60. data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
  61. data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
  62. data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
  63. data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
  64. data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
  65. data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
  66. data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
  67. data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
  68. data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
  69. data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
  70. data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
  71. data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
  72. data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
  73. data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
  74. data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
  75. data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
  76. data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
  77. data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
  78. data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
  79. data/lib/chem_scanner/interpreter/scheme.rb +173 -0
  80. data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
  81. data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
  82. data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
  83. data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
  84. data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
  85. data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
  86. data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
  87. data/lib/chem_scanner/perkin_eln.rb +287 -0
  88. data/lib/chem_scanner/version.rb +5 -0
  89. data/lib/rubygems_plugin.rb +5 -0
  90. metadata +244 -0
@@ -0,0 +1,145 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module ChemDraw
5
+ ALIAS_VALUES = [0, 4, 5, 8, 12].freeze
6
+
7
+ # CDX Node parser
8
+ class FragmentNode < BaseNode
9
+ require "chem_scanner/chem_draw/node/fragment"
10
+ require "chem_scanner/chem_draw/node/text"
11
+
12
+ attr_accessor :num_hydrogens, :atnum, :spin, :charge, :iso, :type,
13
+ :ext_type, :x, :y, :is_alias, :alias_text,
14
+ :warning, :warning_data, :fragment, :nested_fragment,
15
+ :nested_text, :color, :expanded, :point, :is_polymer
16
+
17
# Creates an empty fragment node and sets every attribute to its default.
#
# parser, parser_type and id are forwarded unchanged to BaseNode.
def initialize(parser, parser_type, id)
  super(parser, parser_type, id)

  # Numeric attributes that default to -1.
  @num_hydrogens = @atnum = @type = @ext_type = -1
  # Numeric attributes that default to 0.
  @spin = @charge = @iso = @color = 0

  # Flags, all off by default.
  @warning = false
  @is_alias = false
  @expanded = false
  @is_polymer = false

  @alias_text = ""
  @warning_data = ""

  # Child objects collected by parse_node, keyed by id.
  @nested_fragment = {}
  @nested_text = {}
end
38
+
39
# rubocop:disable Metrics/PerceivedComplexity

# Handles one property or child object read for this node.
#
# The tag is resolved to a reference name through @props_ref first and,
# when it is not found there, through @obj_ref. Known names update the
# matching attribute; anything else is passed to do_unhandled.
#
# tag  - raw tag as delivered by the reader
# nid  - id of the child object (used for nested Fragment / Text objects)
# data - raw payload for the tag
def parse_node(tag, nid, data)
  ref = @props_ref[tag]
  ref = ref.nil? ? @obj_ref[tag] : ref

  case ref
  when "Node_Element" then @atnum = read_value(tag, data)
  when "Atom_Radical" then @spin = read_value(tag, data)
  when "Atom_Isotope" then @iso = read_value(tag, data)
  when "Fragment"
    frag = Fragment.new(@parser, @parser_type, nid)
    frag.read
    @nested_fragment[nid] = frag
  when "Atom_GenericNickname"
    # The text reader is chosen by parser type ("cdx_text" / "cdxml_text").
    nickname = send("#{@parser_type}_text", data)
    @generic_nickname = nickname.first[:text] unless nickname.empty?
  when "Node_Type"
    @type = read_type(tag, data, CDXML_NODE_TYPE)
    @is_alias = ALIAS_VALUES.include?(@type)
  when "2DPosition" then @x, @y = read_value(tag, data)
  when "Atom_Charge" then @charge = read_value(tag, data)
  when "Text"
    @text = Text.new(@parser, @parser_type, nid, true)
    # NOTE: MUST read first in order to maintain CDX reader
    @text.read
    @polygon = @text.polygon

    @nested_text[@text.id] = @text
  when "ChemicalWarning"
    @warning = true
    @warning_data = @parser_type == "cdxml" ? data.text : data
  when "Atom_NumHydrogens" then @num_hydrogens = read_value(tag, data)
  when "ForegroundColor" then @color = read_value(tag, data)
  when "Atom_ExternalConnectionType"
    @ext_type = read_type(tag, data, CDXML_ATOM_EXTERNAL_CONNECTION_TYPE)
  else do_unhandled(tag)
  end
end
# rubocop:enable Metrics/PerceivedComplexity
78
+
79
# Finalises the node once all of its properties have been parsed.
#
# Builds @point from the parsed coordinates, then picks the alias text:
# a non-empty nested text wins; otherwise a generic nickname accepted by
# Interpreter.rgroup_atom? turns the node into an alias of type 7.
def post_parse_node
  @point = Geometry::Point.new(@x, @y)

  has_label = !(@text.nil? || @text.value.empty?)
  if has_label
    @alias_text = @text.value
    return
  end

  return if @generic_nickname.nil?
  return unless ChemScanner::Interpreter.rgroup_atom?(@generic_nickname)

  @is_alias = true
  @type = 7
  @alias_text = @generic_nickname
end
95
+
96
# Left-bottom corner of the node: taken from the polygon's bounding box
# when a polygon is present, otherwise the node's own point.
def leftbottom
  return point if @polygon.nil?

  @polygon.bounding_box.leftbottom
end
99
+
100
# Right-top corner of the node: taken from the polygon's bounding box
# when a polygon is present, otherwise the node's own point.
def righttop
  return point if @polygon.nil?

  @polygon.bounding_box.righttop
end
103
+
104
# True when the node has no usable position: there is no polygon and at
# least one of the x / y coordinates is missing.
def has_nil_coord?
  return false unless @polygon.nil?

  @x.nil? || @y.nil?
end
107
+
108
# Overwrites the node type with the given value.
def set_type(type)
  @type = type
end

# Flags the node as expanded.
def set_expanded
  @expanded = true
end

# Flags the node as a polymer; a polymer node is also marked as an alias.
def set_is_polymer
  @is_alias = @is_polymer = true
end
120
+
121
# Builds a new node of the same class (with a nil id) carrying a copy of
# this node's chemical attributes and shallow copies of the nested
# fragment / text tables.
#
# NOTE(review): ext_type, x, y, point and is_polymer are not copied, so
# the clone keeps the constructor defaults for them - confirm this is
# intended.
def clone
  copy = self.class.new(@parser, @parser_type, nil)

  copied_attrs = %i[
    num_hydrogens atnum spin charge iso color type
    alias_text warning warning_data is_alias expanded
  ]
  copied_attrs.each do |attr|
    copy.public_send("#{attr}=", instance_variable_get("@#{attr}"))
  end

  # New hashes, same child objects.
  copy.nested_fragment =
    @nested_fragment.each_with_object({}) { |(key, frag), acc| acc[key] = frag }
  copy.nested_text =
    @nested_text.each_with_object({}) { |(key, text), acc| acc[key] = text }

  copy
end
143
+ end
144
+ end
145
+ end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module ChemDraw
5
+ # CDX Graphic parser
6
+ class Graphic < BaseNode
7
+ attr_reader :arrow_id, :type, :arrow_head, :head, :tail,
8
+ :line_type, :orbital_type, :oval_type, :polygon
9
+
10
+ GRAPHIC_BRACKET_TYPE = 6
11
+
12
# Creates a graphic node; the line type starts at 0 until a "Line_Type"
# property is parsed.
#
# parser, parser_type and id are forwarded unchanged to BaseNode.
def initialize(parser, parser_type, id)
  super(parser, parser_type, id)

  @line_type = 0
end
17
+
18
# Handles one property read for this graphic.
#
# The tag is resolved through @props_ref; recognised properties update
# the matching instance variable, everything else goes to do_unhandled.
#
# tag  - raw tag as delivered by the reader
# _id  - unused
# data - raw payload for the tag
def parse_node(tag, _id, data)
  prop = @props_ref[tag]

  case prop
  when "Arrow_Type"
    @arrow_head = read_type(tag, data, CDXML_ARROW_TYPE)
  when "Line_Type"
    @line_type = read_type(tag, data, CDXML_LINE_TYPE)
  when "Graphic_Type"
    @type = read_type(tag, data, CDXML_GRAPHIC_TYPE)
  when "BoundingBox"
    # Graphic objects are the only objects whose kCDXProp_BoundingBox
    # property has a special meaning, representing a pair of points
    # rather than a rectangle.
    @polygon = read_value(tag, data)
  when "SupersededBy"
    @arrow_id = read_value(tag, data)
  when "3DMajorAxisEnd"
    @right, @top = read_value(tag, data)
  when "3DMinorAxisEnd"
    @left, @bottom = read_value(tag, data)
  when "Orbital_Type"
    @orbital_type = read_type(tag, data, CDXML_ORBITAL_TYPE)
  when "Oval_Type"
    @oval_type = read_type(tag, data, CDXML_OVAL_TYPE)
  else
    do_unhandled(tag)
  end
end
40
+
41
# Finalises the graphic once all of its properties have been parsed.
#
# Type 5 graphics get their polygon rebuilt by build_orbital_polygon.
# A type 1 graphic that has a polygon and no "SupersededBy" reference is
# treated as an arrow: @head and @tail are taken from polygon vertices
# 1 and 3.
def post_parse_node
  # When dealing with orbital, boundingbox is not reliable
  build_orbital_polygon if @type == 5

  # Only a line without "SupersededBy" that has a "BoundingBox" qualifies.
  return if @type != 1 || !@arrow_id.nil? || @polygon.nil?

  corners = @polygon.vertices
  start_point = corners[1] # start point ~ head
  end_point = corners[3]   # end point ~ tail

  @head = { x: start_point.x, y: start_point.y }
  @tail = { x: end_point.x, y: end_point.y }
end
58
+
59
# Replaces @polygon with a four-point polygon built from the parsed
# @left / @bottom / @right / @top values. Only applies when the orbital
# type is 256 and the oval type is 3; otherwise nothing changes.
def build_orbital_polygon
  return if @orbital_type != 256 || @oval_type != 3

  # Corner order: left-bottom, left-top, right-top, right-bottom.
  corners = [
    [@left, @bottom],
    [@left, @top],
    [@right, @top],
    [@right, @bottom],
  ]
  points = corners.map { |x, y| Geometry::Point.new(x, y) }

  @polygon = Geometry::Polygon.new(points)
end
69
+
70
# True when the graphic is a plain line: type 1, no "SupersededBy"
# reference, and an arrow head that is either unset or zero.
def line?
  return false unless @type == 1
  return false unless @arrow_id.nil?

  @arrow_head.nil? || @arrow_head.zero?
end
73
+
74
# Geometry segment running from the tail point to the head point.
# Requires @head and @tail to have been set by post_parse_node.
def segment
  tail_xy = @tail.values_at(:x, :y)
  head_xy = @head.values_at(:x, :y)

  Geometry::Segment.new_by_arrays(tail_xy, head_xy)
end
80
+
81
# The tail-to-head segment converted with Segment#to_vector.
def vector
  segment.to_vector
end

# The tail-to-head segment converted with Segment#to_line.
def line
  segment.to_line
end

# Always false for a Graphic.
def cross?
  false
end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,242 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module ChemScanner
5
+ module ChemDraw
6
+ # Text parser
7
+ class Text < BaseNode
8
+ attr_accessor :warning, :warning_data, :x, :y, :styled_text, :value,
9
+ :center, :polygon, :bold_text, :non_bold_text
10
+
11
+ GREEK_CHARS = {
12
+ "A" => "Α",
13
+ "a" => "α",
14
+ "B" => "Β",
15
+ "b" => "β",
16
+ "G" => "Γ",
17
+ "g" => "γ",
18
+ "D" => "Δ",
19
+ "d" => "δ",
20
+ "E" => "Ε",
21
+ "e" => "ε",
22
+ "Z" => "Ζ",
23
+ "z" => "ζ",
24
+ "H" => "Η",
25
+ "h" => "η",
26
+ "Q" => "Θ",
27
+ "q" => "θ",
28
+ "I" => "Ι",
29
+ "i" => "ι",
30
+ "K" => "Κ",
31
+ "k" => "κ",
32
+ "L" => "Λ",
33
+ "l" => "λ",
34
+ "M" => "Μ",
35
+ "m" => "μ",
36
+ "N" => "Ν",
37
+ "n" => "ν",
38
+ "C" => "Ξ",
39
+ "c" => "ξ",
40
+ "O" => "Ο",
41
+ "o" => "ο",
42
+ "P" => "Π",
43
+ "p" => "π",
44
+ "R" => "Ρ",
45
+ "r" => "ρ",
46
+ "S" => "Σ",
47
+ "s" => "σ",
48
+ "T" => "Τ",
49
+ "t" => "τ",
50
+ "U" => "Υ",
51
+ "u" => "υ",
52
+ "F" => "Φ",
53
+ "f" => "φ",
54
+ "X" => "Χ",
55
+ "x" => "χ",
56
+ "Y" => "Ψ",
57
+ "y" => "ψ",
58
+ "W" => "Ω",
59
+ "w" => "ω",
60
+ }.freeze
61
+
62
+ BOLD_VAL = 0x01
63
+ FONT_KEY = "face"
64
+ COLOR_KEY = "color"
65
+
66
# Creates an empty text node.
#
# parser, parser_type and id are forwarded unchanged to BaseNode.
# is_alias is stored as given (defaults to false).
def initialize(parser, parser_type, id, is_alias = false)
  super(parser, parser_type, id)

  @is_alias = is_alias
  @warning = false

  # Filled in by post_parse_node once the styled runs are known.
  @value = ""
  @bold_text = ""
end
75
+
76
# Handles one property read for this text object.
#
# tag  - raw tag as delivered by the reader
# _id  - unused
# data - raw payload for the tag
def parse_node(tag, _id, data)
  # NOTE: CDXML text does not have tag
  # "Text" below only happens for CDX
  prop = @props_ref[tag]

  case prop
  when "Text"
    @styled_text = cdx_text(data)
  when "2DPosition"
    @x, @y = read_value(tag, data)
  when "BoundingBox"
    @polygon = read_value(tag, data)
  when "ChemicalWarning"
    @warning = true
    @warning_data = data
  else
    do_unhandled(tag)
  end
end
89
+
90
# Reads the styled text from the parser's reader before the properties
# are parsed. Does nothing for the "cdx" parser type, where the text
# arrives through parse_node instead.
def pre_parse_node
  @styled_text = cdxml_text(@parser.reader) unless @parser_type == "cdx"
end
95
+
96
# Finalises the text once all of its properties have been parsed:
# normalises the styled runs, derives the bold / non-bold / full text
# values, and builds @center from the parsed coordinates.
def post_parse_node
  # Order matters: retrieve_bold_text reads the :bold flags that
  # process_style sets.
  process_style
  retrieve_bold_text

  @center = Geometry::Point.new(@x, @y)
end
102
+
103
# Drops every styled run whose face has bit 0 set (the bit that
# process_style uses for the :bold flag), then re-runs process_style on
# the remaining runs.
def remove_bold
  @styled_text.reject! { |run| run[:face] & 1 == 1 }
  process_style
end
107
+
108
# Renders the styled runs as one string, wrapping the text of each bold
# run in "**".
def markdown
  output = ""
  @styled_text.each do |run|
    piece = run[:bold] ? "**#{run[:text]}**" : run[:text]
    output += piece
  end
  output
end
114
+
115
# Returns the styled runs whose :bold flag is truthy, in original order.
def bolded_styles
  @styled_text.reject { |run| !run[:bold] }
end
118
+
119
+ private
120
+
121
# Normalises every styled run in place and then consolidates bold runs.
#
# For each run: converts the text to UTF-8, turns "\r\n" / "\r" into
# "\n", and records :position (offset of the run in the concatenated
# text) and :length. When the run's font is found in the parser's font
# table, it also maps Symbol-font letters to Greek, normalises dashes,
# and sets :bold from bit 0 of :face.
#
# NOTE(review): when the font is not in the font table the run is
# skipped before the dash normalisation and before :bold is assigned -
# confirm that this is intended.
def process_style
  cursor = 0
  @styled_text.each do |run|
    run[:text] = to_unicode(run[:text])
    run[:text].gsub!(/\r\n?/, "\n")

    # Position and length are taken before the rewrites below.
    run[:position] = cursor
    run[:length] = run[:text].size
    cursor += run[:length]

    font = @parser.font_table.find { |entry| entry[:id] == run[:font] }
    next if font.nil?

    # Non-bold text in the "Symbol" font is mapped letter by letter to
    # Greek and gets a trailing space.
    if font[:name] == "Symbol" && (run[:face] & 1) != 1
      greek = run[:text].gsub(Regexp.union(GREEK_CHARS.keys), GREEK_CHARS)
      run[:text] = greek + " "
    end

    # User use superscript "_" as minus
    run[:text] = "-" if run[:face] == 64 && run[:text] == "_"
    run[:text] = run[:text].gsub("–", "-")

    run[:bold] = (run[:face] & 1) == 1
  end

  # If "3-6" bold, "-" is originally not BOLD. Same for bolded "2a,b"
  # Set bold for single "middle" character
  set_special_bold

  # Merge previous continuous bold text
  merge_bold
end
155
+
156
# Marks a single non-bold "middle" character as bold when it sits
# directly between two bold runs (e.g. the "-" in a bold "3-6").
#
# A run is promoted when the bold run after it starts exactly where it
# ends, its stripped text is one character long, and the run before it
# is bold as well.
def set_special_bold
  return if @styled_text.count < 2

  last_bold = nil
  @styled_text.each_with_index do |run, idx|
    next unless run[:bold]

    # Index of the bold run seen before this one (nil for the first).
    earlier_bold = last_bold
    last_bold = idx
    next if idx.zero?

    middle = @styled_text[idx - 1]
    next unless run[:position] == (middle[:position] + middle[:length])
    next unless middle[:text].strip.length == 1
    next unless earlier_bold == idx - 2

    middle[:bold] = true
  end
end
178
+
179
# Collapses each stretch of adjacent bold runs into its first run: the
# texts are concatenated in order and the later runs are removed from
# @styled_text. Isolated bold runs are left alone.
def merge_bold
  bold_ids = @styled_text.each_index.select { |idx| @styled_text[idx][:bold] }
  return if bold_ids.empty?

  # Group the indices, highest first, into stretches of neighbours.
  stretches = bold_ids.reverse.slice_when { |upper, lower| upper != lower + 1 }.to_a
  stretches.reject! { |ids| ids.count == 1 }

  # Working from the highest index down means a deletion never shifts
  # an index that still has to be processed.
  stretches.each do |ids|
    ids[0..-2].each do |idx|
      @styled_text[idx - 1][:text] += @styled_text[idx][:text]
      @styled_text.delete_at(idx)
    end
  end
end
206
+
207
# Returns the text as UTF-8.
#
# A string already tagged as UTF-8 is returned as-is. Any other string
# is re-tagged in place as CP1252 and transcoded to UTF-8, with invalid
# or undefined bytes replaced by "??".
def to_unicode(text)
  return text if text.encoding == Encoding::UTF_8

  replacement_opts = { invalid: :replace, undef: :replace, replace: "??" }
  text.force_encoding(Encoding::CP1252).encode(Encoding::UTF_8, **replacement_opts)
end
218
+
219
# Derives the plain-text views of the styled runs:
#
# @bold_text     - texts of the bold runs joined with a space, with a
#                  trailing ",", ":" or "." (plus spaces) removed at
#                  each line end
# @non_bold_text - texts of the non-bold runs concatenated
# @value         - texts of all runs concatenated
#
# Each result is stripped, has its line endings normalised to "\n" and
# has U+2219 replaced by U+00B7.
def retrieve_bold_text
  bold_runs, plain_runs = @styled_text.partition { |s| s[:bold] }

  bold = bold_runs.map { |s| s[:text] }.join(" ").gsub(/[,:\.] *$/, "")
  plain = plain_runs.map { |s| s[:text] }.join("")
  full = @styled_text.reduce("") { |mem, obj| "#{mem}#{obj[:text]}" }

  # "\r\n?" treats CRLF as one line break; the alternation "\r|\r\n"
  # could only ever match the lone "\r" and turned CRLF into "\n\n".
  # NOTE: Replace U+2219 to U+00B7
  normalize = ->(str) { str.strip.gsub(/\r\n?/, "\n").gsub("∙", "·") }

  @bold_text = normalize.call(bold)
  @non_bold_text = normalize.call(plain)
  @value = normalize.call(full)
end
232
+
233
# Short debugging representation showing the id, the bold text and the
# full text value.
def inspect
  "#<Text: id=#{@id}, bold: #{@bold_text}, value: #{@value} >"
end
240
+ end
241
+ end
242
+ end