chem_scanner 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +604 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/CODE_OF_CONDUCT.md +74 -0
  9. data/Gemfile +20 -0
  10. data/LICENSE.txt +661 -0
  11. data/README.md +177 -0
  12. data/Rakefile +8 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/chem_scanner.gemspec +43 -0
  16. data/lib/chem_scanner.rb +79 -0
  17. data/lib/chem_scanner/cdx.rb +67 -0
  18. data/lib/chem_scanner/cdxml.rb +72 -0
  19. data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
  20. data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
  21. data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
  22. data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
  23. data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
  24. data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
  25. data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
  26. data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
  27. data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
  28. data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
  29. data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
  30. data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
  31. data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
  32. data/lib/chem_scanner/chem_draw/parser.rb +214 -0
  33. data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
  34. data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
  35. data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
  36. data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
  37. data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
  38. data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
  39. data/lib/chem_scanner/configuration/superatom.rb +76 -0
  40. data/lib/chem_scanner/configuration/superatom.txt +2874 -0
  41. data/lib/chem_scanner/configuration/util.rb +40 -0
  42. data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
  43. data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
  44. data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
  45. data/lib/chem_scanner/doc.rb +56 -0
  46. data/lib/chem_scanner/docx.rb +86 -0
  47. data/lib/chem_scanner/export/cml.rb +176 -0
  48. data/lib/chem_scanner/extension/element_map.rb +9 -0
  49. data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
  50. data/lib/chem_scanner/extension/geometry/line.rb +123 -0
  51. data/lib/chem_scanner/extension/geometry/point.rb +18 -0
  52. data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
  53. data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
  54. data/lib/chem_scanner/extension/passthrough.rb +7 -0
  55. data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
  56. data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
  57. data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
  58. data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
  59. data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
  60. data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
  61. data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
  62. data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
  63. data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
  64. data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
  65. data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
  66. data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
  67. data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
  68. data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
  69. data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
  70. data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
  71. data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
  72. data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
  73. data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
  74. data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
  75. data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
  76. data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
  77. data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
  78. data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
  79. data/lib/chem_scanner/interpreter/scheme.rb +173 -0
  80. data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
  81. data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
  82. data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
  83. data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
  84. data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
  85. data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
  86. data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
  87. data/lib/chem_scanner/perkin_eln.rb +287 -0
  88. data/lib/chem_scanner/version.rb +5 -0
  89. data/lib/rubygems_plugin.rb +5 -0
  90. metadata +244 -0
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module ChemDraw
5
# Bond orders keyed by the CDX bit flag that encodes them. The entry at
# list position +n+ belongs to flag +1 << n+ (0x0001 .. 0x4000).
CDX_BOND_ORDER = [
  1, 2, 3, 4, 5, 6,
  0.5, 1.5, 2.5, 3.5, 4.5, 5.5,
  "dative", "ionic", "hydrogen",
].each_with_index.map { |order, bit| [1 << bit, order] }.to_h.freeze

# CDXML bond display names mapped to their numeric code; the code is the
# position of the name in the list (0 .. 14).
CDXML_BOND_DISPLAY = %w[
  Solid
  Dash
  Hash
  WedgedHashBegin
  WedgedHashEnd
  Bold
  WedgeBegin
  WedgeEnd
  Wavy
  HollowWedgeBegin
  HollowWedgeEnd
  WavyWedgeBegin
  WavyWedgeEnd
  Dot
  DashDot
].each_with_index.to_h.freeze
40
+
41
# Parser node for a ChemDraw bond.
#
# Holds the ids of the two endpoints, the bond order, the bond display
# (kept in +stereo+) and the foreground colour value read from the file.
class Bond < BaseNode
  attr_accessor :begin_id, :end_id, :stereo, :order, :color

  # Starts with no endpoints, display 0, order 1 and colour 0.
  def initialize(parser, parser_type, id)
    super

    @begin_id = @end_id = nil
    @stereo = 0
    @order = 1
    @color = 0
  end

  # Stores the value of one bond property; any tag that is not one of the
  # five handled properties is passed to +do_unhandled+.
  def parse_node(tag, _id, data)
    property = @props_ref[tag]

    case property
    when "Bond_Begin"
      @begin_id = read_value(tag, data)
    when "Bond_End"
      @end_id = read_value(tag, data)
    when "Bond_Order"
      @order = bond_order(read_value(tag, data))
    when "Bond_Display"
      @stereo = bond_display(tag, data)
    when "ForegroundColor"
      @color = read_value(tag, data)
    else
      do_unhandled(tag)
    end
  end

  # CDXML values are returned untouched; any other parser type is looked up
  # in CDX_BOND_ORDER, with 0 for an unknown flag.
  def bond_order(val)
    @parser_type == "cdxml" ? val : CDX_BOND_ORDER.fetch(val, 0)
  end

  # CDX files give the display value directly; otherwise the text of +data+
  # is translated through CDXML_BOND_DISPLAY (nil when the name is unknown).
  def bond_display(tag, data)
    if @parser_type == "cdx"
      read_value(tag, data)
    else
      CDXML_BOND_DISPLAY[data.text]
    end
  end

  # Both endpoint ids as +[begin, end]+.
  def end_points
    [@begin_id, @end_id]
  end

  # Swaps +endpoint+ for +new_point+. Only one side is changed, the begin
  # side taking precedence; nothing happens when neither side matches.
  def replace_endpoint(endpoint, new_point)
    return @begin_id = new_point if @begin_id == endpoint

    @end_id = new_point if @end_id == endpoint
  end

  # The id on the opposite side of +endpoint+. When +endpoint+ is not the
  # begin id, the begin id is returned.
  def other_endpoint(endpoint)
    return @end_id if endpoint == @begin_id

    @begin_id
  end

  # True when +id+ is one of the two endpoint ids.
  def has_endpoint?(id)
    end_points.include?(id)
  end
end
99
+ end
100
+ end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module ChemDraw
5
# Parser node for a bracket attachment; it only keeps the id of the graphic
# the attachment refers to.
class BracketAttachment < BaseNode
  attr_reader :graphic_id

  # Records the graphic id when +tag+ resolves (as a property first, then as
  # an object) to "Bracket_GraphicID"; every other tag is ignored.
  def parse_node(tag, _id, data)
    name = @props_ref[tag] || @obj_ref[tag]

    @graphic_id = read_value(tag, data) if name == "Bracket_GraphicID"
  end
end
16
+ end
17
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module ChemDraw
5
# Parser node for a bracketed group: the ids of the bracketed objects plus
# the BracketAttachment children found inside the group.
class BracketGroup < BaseNode
  require "chem_scanner/chem_draw/node/bracket_attachment"

  attr_reader :attachments, :object_ids

  # Starts with no attachments and no bracketed object ids.
  def initialize(parser, parser_type, id)
    super

    @attachments = []
    @object_ids = []
  end

  # Handles the "BracketedObjects" property and "BracketAttachment" child
  # objects; anything else goes to +do_unhandled+.
  def parse_node(tag, id, data)
    if @props_ref[tag] == "BracketedObjects"
      @object_ids = read_value(tag, data)
      nil
    elsif @obj_ref[tag] == "BracketAttachment"
      child = BracketAttachment.new(@parser, @parser_type, id)
      child.read
      @attachments << child
    else
      do_unhandled(tag)
    end
  end
end
31
+ end
32
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module ChemDraw
5
# Parser node for line/arrow geometry: tail and head positions, the
# arrow-head type, the NoGo value and the line type.
class ChemGeometry < BaseNode
  attr_reader :tail, :head, :nogo, :line_type

  def initialize(parser, parser_type, id)
    super(parser, parser_type, id)

    @middle_points = []
    @line_type = 0
    @arrow_head = nil
  end

  # Arrow-head type read from the "Arrow_ArrowHead_Head" property, or nil
  # when the property was not present.
  #
  # The parsed value lives in @arrow_head; the reader used to be generated
  # by +attr_reader :head_type+, which looked at a never-assigned
  # @head_type and therefore always returned nil.
  def head_type
    @arrow_head
  end

  # Stores the value of one geometry property; unknown tags are passed to
  # +do_unhandled+.
  #
  # NOTE (kept from the original author): head ----> tail (head at tail)
  def parse_node(tag, _nid, data)
    case @props_ref[tag]
    when "3DTail"
      x, y = read_value(tag, data)
      @tail = { x: x, y: y }
    when "3DHead"
      x, y = read_value(tag, data)
      @head = { x: x, y: y }
    when "Arrow_ArrowHead_Head"
      @arrow_head = read_type(tag, data, CDXML_ARROW_TYPE)
    when "Arrow_NoGo" then @nogo = read_value(tag, data)
    when "Line_Type"
      @line_type = read_type(tag, data, CDXML_LINE_TYPE)
    else do_unhandled(tag)
    end
  end

  # Segment running from the tail position to the head position. Both
  # positions must have been parsed; this raises if either is still nil.
  def segment
    Geometry::Segment.new_by_arrays(
      [@tail[:x], @tail[:y]],
      [@head[:x], @head[:y]],
    )
  end

  # The tail-to-head segment converted with +to_vector+.
  def vector
    segment.to_vector
  end

  # The tail-to-head segment converted with +to_line+.
  def line
    segment.to_line
  end

  # True unless the arrow-head type equals 2 (so also true when no head
  # type was read).
  # NOTE(review): 2 is presumably the "full" head entry of
  # CDXML_ARROW_TYPE - confirm against that table.
  def headless
    @arrow_head != 2
  end

  # True when an "Arrow_NoGo" value was read.
  def cross?
    !@nogo.nil?
  end
end
57
+ end
58
+ end
@@ -0,0 +1,46 @@
1
+ module ChemScanner
2
+ module ChemDraw
3
# Colour table of a ChemDraw document, exposed through +table+ as an array
# of six-digit hex strings. The two fixed entries "000000" and "FFFFFF"
# always come first, followed by the colours read from the file.
class ColorTable < BaseNode
  attr_reader :table

  # parser_type - "cdx" selects the binary reader, anything else the CDXML one.
  # data        - raw table bytes (CDX) or the table's XML element (CDXML).
  def initialize(parser_type, data)
    @parser_type = parser_type
    @data = data
  end

  # Fills and returns +table+ using the reader matching the parser type.
  def read
    @parser_type == "cdx" ? read_cdx : read_cdxml
  end

  # Binary layout as read here: a 2-byte count (kept in @nums) followed by
  # 2-byte values taken three at a time as one colour. Only the upper byte
  # of each value is kept. Note that +to_s(16)+ yields lower-case digits,
  # whereas the two fixed entries are upper case.
  def read_cdx
    @nums = read_int(@data[0, 2], true)
    channels = binary_chunks(@data[2..-1], 2).map { |chunk| read_int(chunk, true) }

    table = channels.each_slice(3).map do |rgb|
      rgb.map { |channel| (channel >> 8).to_s(16).rjust(2, "0") }.join
    end

    @table = %w[000000 FFFFFF] + table
  end

  # Reads every <color> child element and converts its r/g/b attributes
  # into one hex string.
  def read_cdxml
    colors = @data.element_children.select { |element| element.name == "color" }
    table = colors.map do |color|
      %w[r g b].map { |channel| channel_hex(color.attr(channel)) }.join
    end

    @table = %w[000000 FFFFFF] + table
  end

  private

  # Converts one CDXML channel attribute into two hex digits.
  #
  # The attribute is treated as a fraction of full intensity, so it must be
  # parsed with +to_f+: the previous +to_i+ truncated values such as "0.5"
  # to 0 and every partially lit channel came out as "00". The scaled value
  # is rounded and clamped to 0..255 so the result is always two digits.
  # A missing attribute (nil) counts as 0.
  def channel_hex(value)
    level = (value.to_f * 255).round
    level = [[level, 0].max, 255].min
    level.to_s(16).rjust(2, "0")
  end
end
45
+ end
46
+ end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module ChemDraw
5
# Font table of a ChemDraw document, exposed through +table+ as an array of
# +{ id:, charset:, name: }+ hashes.
class FontTable < BaseNode
  attr_reader :table

  # parser_type - "cdx" selects the binary reader, anything else the CDXML one.
  # data        - raw table bytes (CDX) or the table's XML element (CDXML).
  def initialize(parser_type, data)
    @parser_type = parser_type
    @data = data
  end

  # Fills and returns +table+ using the reader matching the parser type.
  def read
    return read_cdx if @parser_type == "cdx"

    read_cdxml
  end

  # Binary layout as read here: the entry count sits in the two bytes at
  # offset 2 and the entries start at offset 4. Each entry is a 6-byte
  # header followed by the font name.
  def read_cdx
    @table = []
    entry_count = read_int(@data[2, 2], true)

    offset = 4
    1.upto(entry_count) do
      entry, name_size = read_cdx_font_attribute(offset)
      @table << entry
      offset += name_size + 6
    end

    @table
  end

  # Decodes the entry starting at byte +iter+: three 2-byte integers (id,
  # charset, name length) followed by the name bytes. Returns the entry
  # hash together with the name length.
  def read_cdx_font_attribute(iter)
    id, charset, length = [0, 2, 4].map do |shift|
      read_int(@data[iter + shift, 2], true)
    end

    [{ id: id, charset: charset, name: @data[iter + 6, length] }, length]
  end

  # Builds the table from every <font> child element. The id is converted
  # with +to_i+; charset and name are kept as returned by +attr+.
  def read_cdxml
    fonts = @data.element_children.select { |element| element.name == "font" }

    @table = fonts.map do |font|
      {
        id: font.attr("id").to_i,
        charset: font.attr("charset"),
        name: font.attr("name"),
      }
    end
  end
end
53
+ end
54
+ end
@@ -0,0 +1,149 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module ChemDraw
5
# Parser node for a ChemDraw fragment: its nodes, bonds and graphics, each
# kept in a map keyed by the child's id, plus a bounding polygon.
class Fragment < BaseNode
  require "chem_scanner/chem_draw/node/fragment_node"
  require "chem_scanner/chem_draw/node/bond"

  attr_accessor :boxed, # indicates if the fragment is boxed within a rectangle
                :polygon, :node_map, :bond_map, :graphic_map

  def initialize(parser, parser_type, id)
    super(parser, parser_type, id)
    @boxed = false

    @node_map = {}
    @bond_map = {}

    @graphic_map = {}
  end

  # Dispatches one child of the fragment.
  #
  # +key?+ yields +true+ for any known property tag, which matches none of
  # the branches below, so properties always end up in +do_unhandled+; only
  # object tags ("Node", "Bond", "Graphic") create children.
  #
  # Properties that are deliberately not handled at the moment:
  # * "BoundingBox" (would set @polygon from the property value)
  # * "RepresentsProperty" (indicates that this object represents some
  #   properties in some other objects)
  def parse_node(tag, nid, _data)
    case @props_ref.key?(tag) || @obj_ref[tag]
    when "Node" then create_node(nid)
    when "Bond" then create_bond(nid)
    when "Graphic" then create_graphic(nid)
    else do_unhandled(tag)
    end
  end

  # Builds the bounding polygon from the nodes when none was set and the
  # fragment has at least one node.
  def post_parse_node
    return if !@polygon.nil? || @node_map.empty?

    rebuild_polygon
  end

  # Recomputes @polygon as the axis-aligned rectangle around all nodes that
  # have coordinates.
  #
  # The box is seeded from the first node that has coordinates. It used to
  # be seeded from the first node of the map even when that node was one of
  # those skipped for nil coordinates, which put nil into the comparisons
  # below. Returns nil and leaves @polygon untouched when no node has
  # coordinates.
  def rebuild_polygon
    nodes = @node_map.values.reject(&:has_nil_coord?)
    return if nodes.empty?

    seed = nodes.first
    lb = Geometry::Point.new(seed.x, seed.y)
    rt = Geometry::Point.new(seed.x, seed.y)

    nodes.each do |node|
      nlb = node.leftbottom
      nrt = node.righttop

      lb.x = nlb.x if nlb.x < lb.x
      lb.y = nlb.y if nlb.y < lb.y

      rt.x = nrt.x if nrt.x > rt.x
      rt.y = nrt.y if nrt.y > rt.y
    end

    points = [
      Geometry::Point.new(lb.x, lb.y),
      Geometry::Point.new(lb.x, rt.y),
      Geometry::Point.new(rt.x, rt.y),
      Geometry::Point.new(rt.x, lb.y),
    ]
    @polygon = Geometry::Polygon.new(points)
  end

  # Reads the child node with the given id and stores it in +node_map+.
  def create_node(id)
    node = FragmentNode.new(@parser, @parser_type, id)
    node.read
    @node_map[id] = node
  end

  # Reads the child bond with the given id and stores it in +bond_map+.
  def create_bond(id)
    bond = Bond.new(@parser, @parser_type, id)
    bond.read
    @bond_map[id] = bond
  end

  # Reads the child graphic with the given id and stores it in
  # +graphic_map+. The id must be the child's own id: the dispatching code
  # previously passed the fragment's id here, so every graphic of a
  # fragment was stored under the same key and replaced the previous one.
  def create_graphic(id)
    graphic = Graphic.new(@parser, @parser_type, id)
    graphic.read
    @graphic_map[id] = graphic
  end

  # Sub-hash of +node_map+ holding the nodes whose +type+ equals +type+.
  def get_node_with_type(type)
    @node_map.select { |_, node| node.type == type }
  end

  # Nodes of type 12 (ExternalConnectionPoint), as an id => node hash.
  def get_external_point
    get_node_with_type(12)
  end

  # For every external connection point, finds the node on the other side
  # of the first bond touching it.
  #
  # Returns +[]+ when the fragment has no external connection point,
  # otherwise +[external_ids, internal_ids]+ with matching positions.
  # Raises NoMethodError when an external point has no bond.
  def get_internal_nids
    ext_node = get_external_point
    return [] if ext_node.empty?

    ext_ids = ext_node.keys
    internal_ids = ext_ids.map do |ext_id|
      _, bond = bond_has_endpoint(ext_id)
      bond.other_endpoint(ext_id)
    end

    [ext_ids, internal_ids]
  end

  # First +[id, bond]+ pair of +bond_map+ whose bond touches +endpoint+,
  # or nil when there is none.
  def bond_has_endpoint(endpoint)
    @bond_map.detect { |_, bond| bond.end_points.include?(endpoint) }
  end

  # New Fragment with the same parser, parser type, id and +boxed+ flag.
  # The node and bond maps are fresh hashes that share the original node
  # and bond objects; the polygon and graphic map are not carried over.
  def clone
    cloned = self.class.new(@parser, @parser_type, @id)
    cloned.boxed = @boxed
    cloned.clone_node_map(@node_map)
    cloned.clone_bond_map(@bond_map)

    cloned
  end

  # Replaces +node_map+ with a new hash holding the entries of the given map.
  def clone_node_map(node_map)
    @node_map = {}
    node_map.each do |k, v|
      @node_map[k] = v
    end
  end

  # Replaces +bond_map+ with a new hash holding the entries of the given map.
  def clone_bond_map(bond_map)
    @bond_map = {}
    bond_map.each do |k, v|
      @bond_map[k] = v
    end
  end

  # Takes a temporary id from the parser, assigns it and returns it.
  def set_new_id
    new_id = @parser.get_tempid
    set_id(new_id)
    new_id
  end

  # Overwrites the fragment id.
  def set_id(new_id)
    @id = new_id
  end
end
148
+ end
149
+ end