chem_scanner 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +604 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/CODE_OF_CONDUCT.md +74 -0
  9. data/Gemfile +20 -0
  10. data/LICENSE.txt +661 -0
  11. data/README.md +177 -0
  12. data/Rakefile +8 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/chem_scanner.gemspec +43 -0
  16. data/lib/chem_scanner.rb +79 -0
  17. data/lib/chem_scanner/cdx.rb +67 -0
  18. data/lib/chem_scanner/cdxml.rb +72 -0
  19. data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
  20. data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
  21. data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
  22. data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
  23. data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
  24. data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
  25. data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
  26. data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
  27. data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
  28. data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
  29. data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
  30. data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
  31. data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
  32. data/lib/chem_scanner/chem_draw/parser.rb +214 -0
  33. data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
  34. data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
  35. data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
  36. data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
  37. data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
  38. data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
  39. data/lib/chem_scanner/configuration/superatom.rb +76 -0
  40. data/lib/chem_scanner/configuration/superatom.txt +2874 -0
  41. data/lib/chem_scanner/configuration/util.rb +40 -0
  42. data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
  43. data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
  44. data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
  45. data/lib/chem_scanner/doc.rb +56 -0
  46. data/lib/chem_scanner/docx.rb +86 -0
  47. data/lib/chem_scanner/export/cml.rb +176 -0
  48. data/lib/chem_scanner/extension/element_map.rb +9 -0
  49. data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
  50. data/lib/chem_scanner/extension/geometry/line.rb +123 -0
  51. data/lib/chem_scanner/extension/geometry/point.rb +18 -0
  52. data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
  53. data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
  54. data/lib/chem_scanner/extension/passthrough.rb +7 -0
  55. data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
  56. data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
  57. data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
  58. data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
  59. data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
  60. data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
  61. data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
  62. data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
  63. data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
  64. data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
  65. data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
  66. data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
  67. data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
  68. data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
  69. data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
  70. data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
  71. data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
  72. data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
  73. data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
  74. data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
  75. data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
  76. data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
  77. data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
  78. data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
  79. data/lib/chem_scanner/interpreter/scheme.rb +173 -0
  80. data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
  81. data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
  82. data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
  83. data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
  84. data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
  85. data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
  86. data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
  87. data/lib/chem_scanner/perkin_eln.rb +287 -0
  88. data/lib/chem_scanner/version.rb +5 -0
  89. data/lib/rubygems_plugin.rb +5 -0
  90. metadata +244 -0
@@ -0,0 +1,145 @@
1
+ # frozen_string_literal: true
2
+
3
module ChemScanner
  module ChemDraw
    # Node_Type values for which FragmentNode#parse_node flags a node as an
    # alias (see the "Node_Type" branch).
    ALIAS_VALUES = [0, 4, 5, 8, 12].freeze

    # Parser for a single node of a ChemDraw fragment (CDX or CDXML).
    #
    # Collects the atom-level properties handed to #parse_node, keeps any
    # nested fragments and nested text objects keyed by their ids, and after
    # parsing derives the node position (#point) and alias text.
    class FragmentNode < BaseNode
      require "chem_scanner/chem_draw/node/fragment"
      require "chem_scanner/chem_draw/node/text"

      # Node type assigned in #post_parse_node when the generic nickname is
      # recognised as an R-group atom.
      # NOTE(review): presumably the "generic nickname" entry of
      # CDXML_NODE_TYPE - confirm against that table.
      GENERIC_NICKNAME_TYPE = 7

      # Attributes copied verbatim by #clone.
      # NOTE(review): ext_type, is_polymer, x, y, point and fragment are not
      # copied, so a clone falls back to the constructor defaults for them -
      # confirm this is intentional.
      CLONED_ATTRIBUTES = %i[
        num_hydrogens atnum spin charge iso color type
        alias_text warning warning_data is_alias expanded
      ].freeze

      attr_accessor :num_hydrogens, :atnum, :spin, :charge, :iso, :type,
                    :ext_type, :x, :y, :is_alias, :alias_text,
                    :warning, :warning_data, :fragment, :nested_fragment,
                    :nested_text, :color, :expanded, :point, :is_polymer

      # @param parser      parser object forwarded to BaseNode and to nested
      #                    Fragment / Text objects
      # @param parser_type [String] "cdx" or "cdxml" (compared against
      #                    "cdxml" in #parse_node)
      # @param id          identifier of this node
      def initialize(parser, parser_type, id)
        super(parser, parser_type, id)

        # Defaults that stay in place until the matching property is parsed.
        @num_hydrogens = -1
        @atnum = -1
        @spin = 0
        @charge = 0
        @iso = 0
        @color = 0
        @type = -1
        @ext_type = -1
        @alias_text = ""
        @warning = false
        @warning_data = ""
        @is_alias = false

        @nested_fragment = {}
        @nested_text = {}
        @expanded = false
        @is_polymer = false
      end

      # Handles one property or child object of the node.
      #
      # The tag is resolved through the property reference table first and
      # through the object reference table when the former has no entry.
      #
      # @param tag  raw tag as delivered by the reader
      # @param nid  id of the child object (used for nested Fragment / Text)
      # @param data raw payload of the property
      # rubocop:disable Metrics/PerceivedComplexity
      def parse_node(tag, nid, data)
        ref = @props_ref[tag]
        ref = @obj_ref[tag] if ref.nil?

        case ref
        when "Node_Element" then @atnum = read_value(tag, data)
        when "Atom_Radical" then @spin = read_value(tag, data)
        when "Atom_Isotope" then @iso = read_value(tag, data)
        when "Fragment"
          frag = Fragment.new(@parser, @parser_type, nid)
          frag.read
          @nested_fragment[nid] = frag
        when "Atom_GenericNickname"
          # "<parser_type>_text" dispatches to cdx_text / cdxml_text.
          nickname = send("#{@parser_type}_text", data)
          @generic_nickname = nickname.first[:text] unless nickname.empty?
        when "Node_Type"
          @type = read_type(tag, data, CDXML_NODE_TYPE)
          @is_alias = ALIAS_VALUES.include?(@type)
        when "2DPosition" then @x, @y = read_value(tag, data)
        when "Atom_Charge" then @charge = read_value(tag, data)
        when "Text"
          @text = Text.new(@parser, @parser_type, nid, true)
          # NOTE: MUST read first in order to maintain CDX reader
          @text.read
          @polygon = @text.polygon

          @nested_text[@text.id] = @text
        when "ChemicalWarning"
          @warning = true
          @warning_data = @parser_type == "cdxml" ? data.text : data
        when "Atom_NumHydrogens" then @num_hydrogens = read_value(tag, data)
        when "ForegroundColor" then @color = read_value(tag, data)
        when "Atom_ExternalConnectionType"
          @ext_type = read_type(tag, data, CDXML_ATOM_EXTERNAL_CONNECTION_TYPE)
        else do_unhandled(tag)
        end
      end
      # rubocop:enable Metrics/PerceivedComplexity

      # Finalises the node once all properties were read: builds #point from
      # the parsed coordinates and decides the alias text. A non-empty nested
      # text wins; otherwise an R-group generic nickname turns the node into
      # an alias of type GENERIC_NICKNAME_TYPE.
      def post_parse_node
        @point = Geometry::Point.new(@x, @y)

        if @text && !@text.value.empty?
          @alias_text = @text.value
          return
        end

        return if @generic_nickname.nil?
        return unless ChemScanner::Interpreter.rgroup_atom?(@generic_nickname)

        @is_alias = true
        @type = GENERIC_NICKNAME_TYPE
        @alias_text = @generic_nickname
      end

      # Left-bottom corner of the nested text's bounding box, or the node
      # position when no text polygon was parsed.
      def leftbottom
        @polygon.nil? ? point : @polygon.bounding_box.leftbottom
      end

      # Right-top corner of the nested text's bounding box, or the node
      # position when no text polygon was parsed.
      def righttop
        @polygon.nil? ? point : @polygon.bounding_box.righttop
      end

      # True when neither a complete coordinate pair nor a text polygon is
      # available for this node.
      def has_nil_coord?
        (@x.nil? || @y.nil?) && @polygon.nil?
      end

      # Overrides the node type without touching the alias flag.
      def set_type(type)
        @type = type
      end

      # Marks the node as expanded.
      def set_expanded
        @expanded = true
      end

      # Marks the node as a polymer; a polymer node is also an alias.
      def set_is_polymer
        @is_alias = true
        @is_polymer = true
      end

      # Builds a new node (with a nil id) that carries over the attributes in
      # CLONED_ATTRIBUTES plus shallow copies of the nested fragment and text
      # tables; the nested objects themselves are shared with the original.
      #
      # NOTE(review): this shadows Object#clone and does not accept its
      # `freeze:` keyword.
      def clone
        copy = self.class.new(@parser, @parser_type, nil)
        CLONED_ATTRIBUTES.each do |attr|
          copy.public_send("#{attr}=", instance_variable_get("@#{attr}"))
        end

        copy.nested_fragment = @nested_fragment.dup
        copy.nested_text = @nested_text.dup

        copy
      end
    end
  end
end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
module ChemScanner
  module ChemDraw
    # Parser for ChemDraw graphic objects (lines, arrows, orbitals, ...).
    #
    # After parsing, a plain line that was not superseded by another object
    # exposes #head and #tail taken from its bounding box, which lets it be
    # treated as an arrow via #segment, #vector and #line.
    class Graphic < BaseNode
      GRAPHIC_BRACKET_TYPE = 6

      attr_reader :arrow_id, :type, :arrow_head, :head, :tail,
                  :line_type, :orbital_type, :oval_type, :polygon

      # @param parser      parser object forwarded to BaseNode
      # @param parser_type parser flavour forwarded to BaseNode
      # @param id          identifier of this graphic
      def initialize(parser, parser_type, id)
        super(parser, parser_type, id)

        @line_type = 0
      end

      # Stores one graphic property; tags without a handler are passed to
      # do_unhandled.
      def parse_node(tag, _id, data)
        property = @props_ref[tag]

        case property
        when "Arrow_Type"
          @arrow_head = read_type(tag, data, CDXML_ARROW_TYPE)
        when "Line_Type"
          @line_type = read_type(tag, data, CDXML_LINE_TYPE)
        when "Graphic_Type"
          @type = read_type(tag, data, CDXML_GRAPHIC_TYPE)
        when "BoundingBox"
          # Graphic objects are the only objects whose kCDXProp_BoundingBox
          # property has a special meaning, representing a pair of points
          # rather than a rectangle.
          @polygon = read_value(tag, data)
        when "SupersededBy"
          @arrow_id = read_value(tag, data)
        when "3DMajorAxisEnd"
          @right, @top = read_value(tag, data)
        when "3DMinorAxisEnd"
          @left, @bottom = read_value(tag, data)
        when "Orbital_Type"
          @orbital_type = read_type(tag, data, CDXML_ORBITAL_TYPE)
        when "Oval_Type"
          @oval_type = read_type(tag, data, CDXML_OVAL_TYPE)
        else
          do_unhandled(tag)
        end
      end

      # Derives geometry once every property has been read.
      def post_parse_node
        # The bounding box of an orbital (type 5) is not reliable, so its
        # polygon is rebuilt from the axis end points instead.
        build_orbital_polygon if @type == 5

        # Only a line (type 1) that has a "BoundingBox" and no "SupersededBy"
        # is treated as an arrow.
        return if @type != 1 || !@arrow_id.nil? || @polygon.nil?

        corners = @polygon.vertices
        start_point = corners[1] # start point ~ head
        end_point = corners[3]   # end point ~ tail

        @head = { x: start_point.x, y: start_point.y }
        @tail = { x: end_point.x, y: end_point.y }
      end

      # Replaces the polygon with the rectangle spanned by the parsed
      # major/minor axis end points. Applies only when orbital type is 256
      # and oval type is 3.
      def build_orbital_polygon
        return if @orbital_type != 256 || @oval_type != 3

        corner_coords = [
          [@left, @bottom],
          [@left, @top],
          [@right, @top],
          [@right, @bottom],
        ]
        corner_points = corner_coords.map do |x, y|
          Geometry::Point.new(x, y)
        end

        @polygon = Geometry::Polygon.new(corner_points)
      end

      # True for a non-superseded line graphic whose arrow head is unset or
      # zero.
      def line?
        return false unless @type == 1
        return false unless @arrow_id.nil?

        @arrow_head.nil? || @arrow_head.zero?
      end

      # Segment running from the tail to the head.
      def segment
        tail_xy = [@tail[:x], @tail[:y]]
        head_xy = [@head[:x], @head[:y]]

        Geometry::Segment.new_by_arrays(tail_xy, head_xy)
      end

      # The tail-to-head segment converted with Segment#to_vector.
      def vector
        segment.to_vector
      end

      # The tail-to-head segment converted with Segment#to_line.
      def line
        segment.to_line
      end

      # Always false for a Graphic.
      def cross?
        false
      end
    end
  end
end
@@ -0,0 +1,242 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
module ChemScanner
  module ChemDraw
    # Parser for ChemDraw text objects (CDX and CDXML).
    #
    # The text is kept as a list of style hashes (#styled_text). Each hash
    # carries at least :text, :font and :face and is enriched during
    # #post_parse_node with :position, :length and :bold. From those the
    # plain #value, #bold_text and #non_bold_text strings are derived.
    class Text < BaseNode
      attr_accessor :warning, :warning_data, :x, :y, :styled_text, :value,
                    :center, :polygon, :bold_text, :non_bold_text

      # Latin letters and the Greek letters they are replaced with when the
      # style uses the "Symbol" font (see #process_style).
      GREEK_CHARS = {
        "A" => "Α",
        "a" => "α",
        "B" => "Β",
        "b" => "β",
        "G" => "Γ",
        "g" => "γ",
        "D" => "Δ",
        "d" => "δ",
        "E" => "Ε",
        "e" => "ε",
        "Z" => "Ζ",
        "z" => "ζ",
        "H" => "Η",
        "h" => "η",
        "Q" => "Θ",
        "q" => "θ",
        "I" => "Ι",
        "i" => "ι",
        "K" => "Κ",
        "k" => "κ",
        "L" => "Λ",
        "l" => "λ",
        "M" => "Μ",
        "m" => "μ",
        "N" => "Ν",
        "n" => "ν",
        "C" => "Ξ",
        "c" => "ξ",
        "O" => "Ο",
        "o" => "ο",
        "P" => "Π",
        "p" => "π",
        "R" => "Ρ",
        "r" => "ρ",
        "S" => "Σ",
        "s" => "σ",
        "T" => "Τ",
        "t" => "τ",
        "U" => "Υ",
        "u" => "υ",
        "F" => "Φ",
        "f" => "φ",
        "X" => "Χ",
        "x" => "χ",
        "Y" => "Ψ",
        "y" => "ψ",
        "W" => "Ω",
        "w" => "ω",
      }.freeze

      # Matches every key of GREEK_CHARS; built once instead of per style.
      GREEK_PATTERN = Regexp.union(GREEK_CHARS.keys).freeze

      # Bit of the style face value that marks bold text.
      BOLD_VAL = 0x01
      FONT_KEY = "face"
      COLOR_KEY = "color"

      # @param parser      parser object (its reader and font_table are used)
      # @param parser_type [String] "cdx" or "cdxml"
      # @param id          identifier of this text object
      # @param is_alias    [Boolean] stored as-is; true when created by a
      #                    fragment node
      def initialize(parser, parser_type, id, is_alias = false)
        super(parser, parser_type, id)

        @warning = false
        @is_alias = is_alias

        # Start with an empty style list so that a text object without any
        # parsed text can still be post-processed.
        @styled_text = []
        @bold_text = ""
        @value = ""
      end

      # Stores one text property; tags without a handler are passed to
      # do_unhandled.
      def parse_node(tag, _id, data)
        # NOTE: CDXML text does not have tag
        # "Text" below only happens for CDX
        case @props_ref[tag]
        when "Text" then @styled_text = cdx_text(data)
        when "2DPosition" then @x, @y = read_value(tag, data)
        when "BoundingBox" then @polygon = read_value(tag, data)
        when "ChemicalWarning"
          @warning = true
          # NOTE(review): FragmentNode stores `data.text` here for CDXML;
          # confirm whether the raw `data` is intended for text objects.
          @warning_data = data
        else do_unhandled(tag)
        end
      end

      # For CDXML the styled text is read from the parser's reader before
      # the properties are parsed; CDX delivers it through the "Text" tag.
      def pre_parse_node
        return if @parser_type == "cdx"

        @styled_text = cdxml_text(@parser.reader)
      end

      # Normalises the styles, derives the bold / non-bold / full strings and
      # builds #center from the parsed position.
      def post_parse_node
        @styled_text ||= []

        process_style
        retrieve_bold_text

        @center = Geometry::Point.new(@x, @y)
      end

      # Drops every style whose face has the bold bit set and re-runs the
      # style processing on the remainder.
      #
      # NOTE(review): #bold_text, #non_bold_text and #value are not refreshed
      # here, and re-processing appends another trailing space to styles in
      # the "Symbol" font - confirm callers expect that.
      def remove_bold
        @styled_text.delete_if { |s| bold_face?(s[:face]) }
        process_style
      end

      # @return [String] the text with bold styles wrapped in "**"
      def markdown
        @styled_text.map do |style|
          style[:bold] ? "**#{style[:text]}**" : style[:text]
        end.join
      end

      # @return [Array<Hash>] the styles currently flagged as bold
      def bolded_styles
        @styled_text.select { |s| s[:bold] }
      end

      # Short description with id, bold text and full value.
      def inspect
        "#<Text: id=#{@id}, bold: #{@bold_text}, value: #{@value} >"
      end

      private

      # True when the bold bit is set in the given face value; a missing
      # face counts as not bold.
      def bold_face?(face)
        (face.to_i & BOLD_VAL) == BOLD_VAL
      end

      # Normalises every style in place: converts the text to UTF-8, unifies
      # line breaks, records :position / :length, translates Symbol-font
      # letters to Greek, normalises dashes and sets :bold. Afterwards bold
      # runs are completed and merged.
      def process_style
        offset = 0
        @styled_text.each do |style|
          text = to_unicode(style[:text]).gsub(/\r\n?/, "\n")
          style[:text] = text

          # Position and length refer to the text before any Greek
          # translation below.
          style[:position] = offset
          style[:length] = text.size
          offset += text.size

          face = style[:face].to_i
          bold = bold_face?(face)

          # A style whose font is missing from the font table only skips the
          # Symbol translation; dash handling and the bold flag still apply.
          font = @parser.font_table.find { |f| f[:id] == style[:font] }
          if font && font[:name] == "Symbol" && !bold
            style[:text] = "#{text.gsub(GREEK_PATTERN, GREEK_CHARS)} "
          end

          # User use superscript "_" as minus
          style[:text] = "-" if face == 64 && style[:text] == "_"
          style[:text] = style[:text].gsub("–", "-")

          style[:bold] = bold
        end

        # If "3-6" bold, "-" is originally not BOLD. Same for bolded "2a,b"
        # Set bold for single "middle" character
        set_special_bold

        # Merge previous continuous bold text
        merge_bold
      end

      # Flags a single-character style as bold when it sits directly between
      # two bold styles.
      def set_special_bold
        return if @styled_text.count < 2

        last_bold_idx = nil
        @styled_text.each_with_index do |style, idx|
          next unless style[:bold]

          prev_bold_idx = last_bold_idx
          last_bold_idx = idx
          next if idx.zero?

          between = @styled_text[idx - 1]
          adjacent =
            style[:position] == (between[:position] + between[:length])
          single_char = between[:text].strip.length == 1
          next unless adjacent && single_char && prev_bold_idx == idx - 2

          between[:bold] = true
        end
      end

      # Collapses every run of neighbouring bold styles into the first style
      # of the run by concatenating the texts and removing the others.
      #
      # NOTE(review): :position / :length of the surviving style are left
      # untouched, so they describe only its original text.
      def merge_bold
        bold_ids = @styled_text.each_index.select do |idx|
          @styled_text[idx][:bold]
        end
        return if bold_ids.empty?

        runs = bold_ids.slice_when { |a, b| b != a + 1 }.
               select { |run| run.size > 1 }

        # Work from the back so earlier indices stay valid while deleting.
        runs.reverse_each do |run|
          first = run.first
          merged = run.map { |idx| @styled_text[idx][:text] }.join
          @styled_text[first][:text] = merged
          @styled_text.slice!(first + 1, run.size - 1)
        end
      end

      # Returns the text as UTF-8. Text in any other encoding is interpreted
      # as CP1252; bytes without a mapping are replaced with "??". The
      # argument itself is never modified.
      def to_unicode(text)
        return text if text.encoding == Encoding::UTF_8

        text.dup.force_encoding(Encoding::CP1252).encode(
          Encoding::UTF_8,
          invalid: :replace,
          undef: :replace,
          replace: "??",
        )
      end

      # Derives #bold_text (bold styles joined by a space, trailing
      # ",", ":" or "." removed), #non_bold_text and #value from the styles.
      def retrieve_bold_text
        bold_styles, plain_styles = @styled_text.partition { |s| s[:bold] }

        bold_joined = bold_styles.map { |s| s[:text] }.join(" ")
        @bold_text = normalize_text(bold_joined.gsub(/[,:\.] *$/, ""))
        @non_bold_text = normalize_text(plain_styles.map { |s| s[:text] }.join)
        @value = normalize_text(@styled_text.map { |s| s[:text] }.join)
      end

      # Strips surrounding whitespace, turns CR / CRLF into a single LF and
      # replaces U+2219 with U+00B7.
      def normalize_text(text)
        text.strip.gsub(/\r\n?/, "\n").gsub("∙", "·")
      end
    end
  end
end