chem_scanner 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +3 -0
- data/.rubocop.yml +604 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +20 -0
- data/LICENSE.txt +661 -0
- data/README.md +177 -0
- data/Rakefile +8 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/chem_scanner.gemspec +43 -0
- data/lib/chem_scanner.rb +79 -0
- data/lib/chem_scanner/cdx.rb +67 -0
- data/lib/chem_scanner/cdxml.rb +72 -0
- data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
- data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
- data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
- data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
- data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
- data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
- data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
- data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
- data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
- data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
- data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
- data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
- data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
- data/lib/chem_scanner/chem_draw/parser.rb +214 -0
- data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
- data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
- data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
- data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
- data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
- data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
- data/lib/chem_scanner/configuration/superatom.rb +76 -0
- data/lib/chem_scanner/configuration/superatom.txt +2874 -0
- data/lib/chem_scanner/configuration/util.rb +40 -0
- data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
- data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
- data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
- data/lib/chem_scanner/doc.rb +56 -0
- data/lib/chem_scanner/docx.rb +86 -0
- data/lib/chem_scanner/export/cml.rb +176 -0
- data/lib/chem_scanner/extension/element_map.rb +9 -0
- data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
- data/lib/chem_scanner/extension/geometry/line.rb +123 -0
- data/lib/chem_scanner/extension/geometry/point.rb +18 -0
- data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
- data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
- data/lib/chem_scanner/extension/passthrough.rb +7 -0
- data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
- data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
- data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
- data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
- data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
- data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
- data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
- data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
- data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
- data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
- data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
- data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
- data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
- data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
- data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
- data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
- data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
- data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
- data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
- data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
- data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
- data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
- data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
- data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
- data/lib/chem_scanner/interpreter/scheme.rb +173 -0
- data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
- data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
- data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
- data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
- data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
- data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
- data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
- data/lib/chem_scanner/perkin_eln.rb +287 -0
- data/lib/chem_scanner/version.rb +5 -0
- data/lib/rubygems_plugin.rb +5 -0
- metadata +244 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ChemScanner
|
|
4
|
+
module Interpreter
|
|
5
|
+
# MoleculeGroup - molecules represented as text
|
|
6
|
+
# A titled group of molecules: each fragment added to the group is wrapped in
# a Molecule, and the group's title (when present) supplies the molecule's
# abbreviation.
class MoleculeGroup
  attr_accessor :title
  attr_reader :polygon, :molecules, :molecule_ids

  # @param title [Object, nil] title object; this class reads `#id` and
  #   `#value` from it (NOTE(review): presumably a text node - confirm).
  def initialize(title = nil)
    @title = title
    # Nothing in this class assigns the polygon; initialise it so the reader
    # is defined explicitly.
    @polygon = nil
    @molecules = []
    @molecule_ids = []
  end

  # Wraps +fragment+ in a Molecule, processes it and registers it (and the
  # fragment id) with the group.
  #
  # @param fragment [Object] fragment responding to `#id`
  # @return [Array] the updated list of molecule ids
  def add_fragment(fragment)
    mol = Molecule.new(fragment)
    mol.process
    # A group may be created without a title (see #initialize); in that case
    # the molecule keeps whatever abbreviation it already has.
    mol.abbreviation = title.value unless title.nil?
    molecules.push(mol)
    @molecule_ids.push(fragment.id)
  end

  # Debug representation; safe to call on a group without a title.
  def inspect
    "#<MoleculeGroup: id=#{@title&.id}, " \
      "text: #{@title}, " \
      "molecule_ids: #{@molecule_ids}, " \
      "molecules: #{@molecules} >"
  end
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ChemScanner
|
|
4
|
+
module Interpreter
|
|
5
|
+
# Reaction
|
|
6
|
+
# A single reaction: id lists and molecule lists for reactants, reagents and
# products, the arrow it belongs to, free-text information (description,
# temperature, yield, time) and optional steps.
class Reaction
  attr_accessor :reactant_ids, :reagent_ids, :product_ids,
                :text_ids, :arrow_id, :arrow,
                :reactants, :products, :reagents,
                :reagent_smiles, :reagent_abbs,
                :description, :temperature, :yield, :time,
                :steps, :details, :clone_from

  def initialize
    @arrow = nil

    @reactant_ids = []
    @reagent_ids = []
    @product_ids = []
    @text_ids = []

    @reactants = []
    @reagents = []
    @products = []
    @reagent_smiles = []
    @reagent_abbs = []

    @description = ""
    @temperature = ""
    @yield = ""
    @time = ""
    @details = OpenStruct.new

    @steps = []
  end

  # @return [String] "reactants>reagents>products", each part being the
  #   dot-joined canonical SMILES; reagent molecules without a SMILES are
  #   skipped and the plain reagent SMILES strings are appended.
  def reaction_smiles
    reactant_part = @reactants.map(&:cano_smiles).join(".")
    product_part = @products.map(&:cano_smiles).join(".")
    reagent_part = (@reagents.map(&:cano_smiles).compact + @reagent_smiles).join(".")

    "#{reactant_part}>#{reagent_part}>#{product_part}"
  end

  def reactant_molfiles
    @reactants.map { |r| r[:mdl] }
  end

  def reagent_molfiles
    @reagents.map { |r| r[:mdl] }
  end

  def product_molfiles
    @products.map { |r| r[:mdl] }
  end

  def debug_print
    "reaction #{@arrow_id}: "\
    "#{reactant_ids} > #{reagent_ids} > #{product_ids}"
  end

  def debug_print_smiles
    "reaction #{@arrow_id}: "\
    "#{reactant_ids} - #{reagent_ids} - #{product_ids}: #{reaction_smiles}"
  end

  # Ids of reactants followed by ids of products (reagents excluded).
  def molecule_ids
    @reactant_ids + @product_ids
  end

  # Reagent ids followed by #molecule_ids.
  def all_ids
    @reagent_ids + molecule_ids
  end

  # Removes +id+ from every id list that contains it.
  def delete_id(id)
    [@reactant_ids, @reagent_ids, @product_ids].each do |group|
      # Array#delete is already a no-op when the id is absent.
      group.delete(id)
    end
  end

  # Replaces +old_id+ by +new_id+ in every id list containing it. The new id
  # is appended, so its position in the list is not preserved.
  def replace_id(old_id, new_id)
    [@reactant_ids, @reagent_ids, @product_ids].each do |group|
      next unless group.include?(old_id)

      group.delete(old_id)
      group.push(new_id)
    end
  end

  # Swaps the first molecule of each group whose id (or clone origin) equals
  # +old_id+ for +new_mol+, then updates the id lists accordingly.
  def replace_molecule(old_id, new_mol)
    oid = old_id

    [@reactants, @reagents, @products].each do |group|
      idx = group.index { |m| [m.id, m.clone_from].include?(old_id) }
      next if idx.nil?

      m = group[idx]
      # The id lists are updated under the clone origin when there is one.
      oid = m.clone_from unless m.clone_from.nil?
      group[idx] = new_mol
    end

    replace_id(oid, new_mol.id)
  end

  # Removes every molecule with the given id from all three molecule lists.
  def delete_molecule_by_id(id)
    [@reactants, @reagents, @products].each do |group|
      group.delete_if { |mol| mol.id == id }
    end
  end

  # @return [String] "Failed", "Planned" or "Succesful", derived from the
  #   arrow (`cross`, `line_type`) and the products (`check_red`).
  def status
    return "Failed" if @arrow.cross
    return "Planned" if @arrow.line_type == 1

    return "Failed" unless @products.detect(&:check_red).nil?

    # NOTE(review): the spelling is kept as is because callers may compare
    # against this exact string.
    "Succesful"
  end

  # Builds a copy with cloned arrow and molecules. `clone_from` of the copy
  # points at the original reaction's arrow id (or at the origin this
  # reaction was itself cloned from).
  # NOTE(review): text_ids, reagent_abbs and steps are not carried over -
  # confirm that this is intended.
  def clone
    cloned = self.class.new
    unless @arrow.nil?
      cloned.arrow_id = @arrow.get_tempid
      cloned.arrow = @arrow.clone
    end

    %w[reactant reagent product].each do |group|
      cloned_groups = cloned.send("#{group}s")
      cloned_groups.concat(instance_variable_get("@#{group}s").map(&:clone))
      cloned.send("#{group}_ids=", cloned_groups.map(&:id))
    end

    cloned.reagent_smiles = @reagent_smiles.dup

    cloned.description = @description.dup
    cloned.temperature = @temperature.dup
    cloned.yield = @yield.dup
    cloned.time = @time.dup
    cloned.details = @details.dup

    cloned.clone_from = @clone_from.nil? ? arrow_id : @clone_from

    cloned
  end

  # Hash form of the reaction. Molecule lists are ordered by canonical
  # SMILES; a missing SMILES sorts as the empty string instead of raising.
  def to_hash
    by_smiles = ->(mol) { mol.cano_smiles.to_s }

    {
      id: arrow_id,
      reactants: @reactants.sort_by(&by_smiles).map(&:to_hash),
      reagents: @reagents.sort_by(&by_smiles).map(&:to_hash),
      products: @products.sort_by(&by_smiles).map(&:to_hash),
      steps: @steps.map(&:to_hash),
      reagent_smiles: reagent_smiles.sort,
      description: @description,
      temperature: @temperature,
      yield: @yield,
      time: @time,
      details: @details.to_h,
    }
  end

  # Debug representation; safe to call before an arrow has been assigned.
  def inspect
    "#<Reaction: id=#{@arrow&.id}, " \
      "reactant_ids=#{@reactant_ids}, " \
      "reagent_ids=#{@reagent_ids}, " \
      "product_ids=#{@product_ids}, " \
      "text_ids=#{@text_ids}, " \
      "reactants=#{@reactants}, " \
      "reagents=#{@reagents}, " \
      "products=#{@products}, " \
      "reagent_smiles=#{@reagent_smiles}, " \
      "description=#{@description}, " \
      "temperature=#{@temperature}, " \
      "yield=#{@yield}, " \
      "time=#{@time}, " \
      "details=#{@details} >"
  end
end
|
|
185
|
+
end
|
|
186
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ChemScanner
|
|
4
|
+
module Interpreter
|
|
5
|
+
# Reaction Step
|
|
6
|
+
# One numbered step of a reaction: description, time, temperature and the
# reagents used in that step.
class ReactionStep
  attr_accessor :description, :time, :temperature, :reagents, :number

  def initialize
    @number = 0
    @description = ""
    @time = ""
    @temperature = ""

    @reagents = []
  end

  # Debug representation, closed with " >" like the other interpreter
  # elements' #inspect output.
  def inspect
    "#<ReactionStep: description=#{@description}, " \
      "number=#{@number}, " \
      "time=#{@time}, " \
      "temperature=#{@temperature}, " \
      "reagents=#{@reagents} >"
  end

  # @return [Hash] the step's attributes keyed by symbol
  def to_hash
    {
      number: @number,
      description: @description,
      time: @time,
      temperature: @temperature,
      reagents: @reagents,
    }
  end
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
module ChemScanner
|
|
5
|
+
module Interpreter
|
|
6
|
+
OPEN_MARK = '[\(\[\{]'.freeze
|
|
7
|
+
CLOSE_MARK = '[\)\]\}]'.freeze
|
|
8
|
+
|
|
9
|
+
# NOTE: WIP file
|
|
10
|
+
# Work in progress: entry point for turning an inorganic formula into a
# molecule.
#
# Returns nil for anything that is not exactly a String. A formula without an
# opening bracket is handed to parse_formula; bracketed formulas are only
# matched against `formula_regex` so far.
# NOTE(review): `formula_regex` is not defined in the visible source, so the
# bracketed path presumably raises NameError until it is implemented.
def mol_from_inorganic_formula(text)
  return nil unless text.instance_of?(String)
  return parse_formula(text) unless text.match?(/#{OPEN_MARK}/)

  text.match(formula_regex)
end
|
|
22
|
+
|
|
23
|
+
# Work in progress: parses a bracket-free formula such as "NaCl" and looks
# for valence assignments whose sum equals +out_valence+.
#
# The first element of the formula is combined with every possible choice of
# one valence per remaining element. A choice is kept when
# (valence of first element) + (sum of the chosen valences) == out_valence.
# NOTE(review): the element counts (:num) are parsed but not used as weights
# in the sum - confirm whether they should be.
#
# @param formula [String] formula made of element names, digits 1-9 and +/-
# @param out_valence [Integer] target valence sum
# @return [Array<Array<Integer>>, nil] one entry per matching choice; each
#   entry holds, for every element after the first, the index of the chosen
#   valence in that element's valence list. nil when the formula contains
#   unknown tokens, fewer than two elements, or an element whose first
#   listed valence is zero.
def parse_formula(formula, out_valence = 0)
  # NOTE: sort alphabetically then by length,
  # so that C will not be caught first in Ca
  el_names = ELEMENTS.map { |x| x["name"] }
  els = el_names.sort_by { |a| [a[0], -a.size] }.join("|")
  num = "[1-9]"
  charge = "[-+]"
  # Anything left over after removing known tokens means an unknown symbol.
  return nil unless formula.split(/#{els}|#{num}|#{charge}/).empty?

  el_arr = []
  formula.scan(/(#{els})(#{num}{0,2})/) do |el, elnum|
    el_info = ELEMENTS.detect { |e| e["name"] == el }
    return nil if el_info.nil? || el_info["valences"][2].first.zero?

    el_arr.push(
      name: el,
      num: elnum.empty? ? 1 : elnum.to_i,
      valences: el_info["valences"][2],
    )
  end
  # A single element (or an empty formula) has nothing to balance against.
  return nil if el_arr.size < 2

  fel, *others = el_arr

  # Every way of picking one valence index per remaining element.
  index_choices = others.map { |el| (0...el[:valences].size).to_a }
  combinations = index_choices.first.product(*index_choices.drop(1))

  valence_combination = []
  fel[:valences].each do |fvalen|
    combinations.each do |combo|
      vasum = combo.each_with_index.inject(0) do |sum, (valence_idx, el_idx)|
        sum + others[el_idx][:valences][valence_idx]
      end

      valence_combination.push(combo) if (vasum + fvalen) == out_valence
    end
  end

  valence_combination
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ChemScanner
|
|
4
|
+
# Interpret the parsed/extracted geometry block
|
|
5
|
+
module Interpreter
|
|
6
|
+
class Scheme
  # Resolves the id lists of every reaction into objects.
  #
  # For each reactant/reagent/product id:
  # * an id found in @text_map is recorded in the reaction's text_ids;
  # * otherwise an id found in @mol_map adds that molecule to the
  #   corresponding molecule list;
  # * otherwise the first molecule of every molecule group listing the id is
  #   added.
  # Finally the reaction's arrow is looked up in @arrow_map and the arrow's
  # text ids are merged into text_ids (duplicates removed).
  def assemble_reaction
    @reactions.each do |reaction|
      %w[reactant reagent product].each do |kind|
        ids = reaction.send("#{kind}_ids")
        members = reaction.send("#{kind}s")

        ids.each do |id|
          if @text_map.key?(id)
            reaction.text_ids.push(id)
          elsif @mol_map.key?(id)
            members.push(@mol_map[id])
          else
            @mol_group_map.each_value do |mgroup|
              next unless mgroup.molecule_ids.include?(id)

              members.push(mgroup.molecules.first)
            end
          end
        end
      end

      reaction.arrow = @arrow_map[reaction.arrow_id]
      reaction.text_ids.concat(reaction.arrow.text_arr).uniq!
    end
  end
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ChemScanner
|
|
4
|
+
module Interpreter
|
|
5
|
+
using Extension
|
|
6
|
+
|
|
7
|
+
module PostProcess
  # Adds molecules referenced by the arrow texts of each reaction to that
  # reaction's reagent ids.
  #
  # Bold text parts are compared with molecule labels; non-bold parts
  # (at least 3 characters long and not containing an "eq"/"equiv" marker)
  # are compared with the stripped molecule text. The first matching entry
  # of @mol_map wins and an id is never added twice.
  def replace_label_by_molecule
    @reactions.each do |reaction|
      # Registers the key of a [key, molecule] pair found in @mol_map.
      register = lambda do |entry|
        next if entry.nil?

        mol_id = entry[0]
        next if reaction.reagent_ids.include?(mol_id)

        reaction.reagent_ids.push(mol_id)
      end

      @arrow_map[reaction.arrow_id].text_arr.each do |text_id|
        text = @text_map[text_id]

        text.bold_text.strip.split(ABB_DELIM).each do |label|
          next if label.empty?

          register.call(@mol_map.detect { |_, m| m.label == label })
        end

        text.non_bold_text.strip.split(ABB_DELIM).each do |plain|
          next if plain.empty?
          next if plain.length < 3 || plain =~ /eq(uiv)?\.?/

          register.call(@mol_map.detect { |_, m| m.text.strip == plain.strip })
        end
      end
    end
  end
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
module ChemScanner
|
|
5
|
+
# Interpreter of extracted/scanned information
|
|
6
|
+
module Interpreter
|
|
7
|
+
using Extension
|
|
8
|
+
|
|
9
|
+
JOIN_WORDS = %w[and with plus].freeze
|
|
10
|
+
|
|
11
|
+
START_REGEX = '(?<=\s|,|;|\n|\r|\[|\(|\.|\A|^)+'
|
|
12
|
+
ENDING_REGEX = '(?=\s|,|;|\n|\r|\]|\)|\.|\z|$)+'
|
|
13
|
+
|
|
14
|
+
DEGREE_REGEX = '((°\s*[CF])|(℃|℉))'
|
|
15
|
+
RANGE_REGEX = "(-|−|–|—|~|to|till|until)"
|
|
16
|
+
|
|
17
|
+
# Extraction of reaction information (reagents named in text, temperature,
# yield, time) from the texts attached to a reaction.
module PostProcess
  # Fills in reagents, temperature, yield, time and description of
  # +reaction+ from the texts listed in its text_ids.
  #
  # A text without a molecule group is looked up as abbreviations. A text
  # with a group either merges the group's drawn molecules with predefined
  # abbreviations (when the group title equals the text) or adds the group's
  # molecules plus the abbreviations found in the title.
  # The yield read from product texts takes precedence over the one found in
  # the descriptions.
  def process_reaction_info(reaction)
    descs = []
    reaction.text_ids.each do |tid|
      text = @text_map[tid].value
      descs.push(text)

      mgroup = @mol_group_map[tid]
      if mgroup.nil?
        abb_mol = name_to_struct(text)
        reaction.reagent_smiles.concat(abb_mol.values)
        reaction.reagent_abbs.concat(abb_mol.keys)
        next
      end

      mtext = mgroup.title.value
      if mtext == text
        merge_chemdraw_with_predefined(mgroup, reaction)
      else
        descs.push(mtext)
        reaction.reagents.concat(mgroup.molecules)
        reaction.reagent_smiles.concat(name_to_struct(mtext).values)
      end
    end

    temperature, ryield, time = extract_reaction_info(descs)
    pyield = extract_product_yield(reaction)

    reaction.temperature = temperature
    reaction.yield = pyield.empty? ? ryield : pyield
    reaction.time = time
    reaction.description = descs.reject { |e| e.to_s.empty? }.join("\n")
  end

  # Splits +text+ on ABB_DELIM and drops parts of one character or less.
  def split_text(text)
    text.split(ABB_DELIM).select { |t| t.length > 1 }
  end

  # Maps the abbreviations found in +text+ to what
  # ChemScanner.get_abbreviation returns for them.
  #
  # Parts that are not abbreviations on their own are re-joined with spaces
  # and searched for abbreviations whose key contains a space.
  #
  # @return [Hash] abbreviation => looked-up value
  def name_to_struct(text)
    smis = {}
    remain = []

    split_text(text).each do |t|
      smi = ChemScanner.get_abbreviation(t)
      if smi.empty?
        remain.push(t)
      else
        smis[t] = smi
      end
    end
    return smis if remain.empty?

    tmp = remain.join(" ")
    multi_word = ChemScanner.all_abbreviations.keys.select { |key| key.include?(" ") }
    multi_word.each do |abb|
      next unless tmp.include?(abb)

      # Consume the matched words so they cannot be matched a second time.
      tmp.slice!(abb)
      smis[abb] = ChemScanner.get_abbreviation(abb)
    end

    smis
  end

  # For a group whose title equals the text: title parts that are known
  # abbreviations contribute their looked-up value to reagent_smiles, the
  # other parts contribute the group's molecule at the same position
  # (if there is one).
  def merge_chemdraw_with_predefined(mgroup, reaction)
    mtext = mgroup.title.value
    abb_hash = name_to_struct(mtext)

    split_text(mtext).each_with_index do |text, idx|
      abb_smi = abb_hash[text]

      if abb_smi.nil?
        mol = mgroup.molecules[idx]
        reaction.reagents.push(mol) unless mol.nil?
      else
        reaction.reagent_smiles.push(abb_smi)
      end
    end
  end

  # @param descs [Array<String>] description texts
  # @return [Array<String>] [temperatures, yields, times], each a
  #   ";"-joined list of the non-empty values found per description
  def extract_reaction_info(descs)
    ryield = []
    temperatures = []
    times = []

    descs.each do |desc|
      dyield = extract_yield_info(desc)
      ryield.push(dyield) unless dyield.empty?

      temp = extract_temperature(desc)
      temperatures.push(temp) unless temp.empty?

      time = extract_time_info(desc)
      times.push(time) unless time.empty?
    end

    [
      temperatures.join(";"),
      ryield.join(";"),
      times.join(";"),
    ]
  end

  # Collects the yields written in the texts of the reaction's products.
  #
  # Products without yield information are skipped, so the result is empty
  # (and the caller can fall back to the description yield) when no product
  # text carries a yield.
  #
  # @return [String] ";"-joined yields
  def extract_product_yield(reaction)
    pyields = []

    reaction.products.each do |mol|
      product_text = mol.text.strip
      next if product_text.empty?

      pyield = extract_yield_info(product_text)
      pyields.push(pyield) unless pyield.empty?
    end

    pyields.join(";")
  end

  # Regex source for "<number> <unit>" optionally preceded by
  # "<number> [<unit>] <range word>", e.g. "80-85%".
  #
  # @param unit_regex [String] regex source of the unit
  # @param can_negative [Boolean] whether a leading minus sign is accepted
  # @return [String] regex source
  def range_number_regex(unit_regex, can_negative)
    sign = can_negative ? "(-|−|–|—)?\\s*" : ""
    # Single quotes keep the backslash, so the decimal point is a literal dot.
    real_number = '(\d+|\d+\.\d+)'

    "#{sign}(#{real_number}\\s*#{unit_regex}?\\s*" \
    "#{RANGE_REGEX})?#{real_number}\\s*#{unit_regex}"
  end

  # Regex for a duration such as "2 h", "2-3 h", "2 h to 3 h" or
  # "1 h and 30 min". The first capture group holds the whole duration.
  def time_duration_range_regex
    day = "days?|dy|d"
    hour = "hours?|hrs?|h"
    minute = "minutes?|mins?|m"
    second = "seconds?|secs?|s"
    real_number = '(\d+|\d+\.\d+)'

    time_unit = "(#{day}|#{hour}|#{minute}|#{second})"
    join_words = JOIN_WORDS.join("|")
    linker_regex = "(#{RANGE_REGEX}|(#{join_words}))"

    # The leading "<number> [<unit>] [<linker>]" part is optional as a whole;
    # only the final "<number> <unit>" is required.
    leading = "(?:#{real_number}\\s*#{time_unit}?\\s*(#{linker_regex}\\s*)?)?"

    %r{
      #{START_REGEX}
      (#{leading}(#{real_number}\s*#{time_unit}))
      #{ENDING_REGEX}
    }x
  end

  # @return [String] first yield ("85%", "80-85%") found in +text+, or "";
  #   percentages followed by "ee" are not treated as yields
  def extract_yield_info(text)
    yield_regex_str = range_number_regex("%", false)
    yield_regex = %r{
      #{START_REGEX}
      #{yield_regex_str}(?!\s*ee)
      #{ENDING_REGEX}
    }x

    text_regex(text, yield_regex)
  end

  # @return [String] ";"-joined durations found in +text+; an
  #   "overnight"/"ovn"/"o/n" marker adds "12h ~ 20h"
  def extract_time_info(text)
    time = []
    text.scan(time_duration_range_regex) { |m| time << m[0] }

    ovn_words = "overnight|ovn|o/n"
    ovn_regex = %r{
      #{START_REGEX}
      (#{ovn_words})
      #{ENDING_REGEX}
    }xi
    ovn = text_regex(text, ovn_regex)
    time.push("12h ~ 20h") unless ovn.empty?

    time.join(";")
  end

  # @return [String] first temperature found in +text+; a room-temperature
  #   marker ("rt", "r.t.") adds "20°C ~ 25°C"
  def extract_temperature(text)
    temp_regex_str = range_number_regex(DEGREE_REGEX, true)
    temperature_regex = %r{
      #{START_REGEX}
      #{temp_regex_str}
      #{ENDING_REGEX}
    }x
    temp = text_regex(text, temperature_regex)

    rt_regex = %r{
      #{START_REGEX}
      r\.?t\.?
      #{ENDING_REGEX}
    }xi
    m = text.match(rt_regex)
    return temp if m.nil? || m[0].empty?

    rt = "20°C ~ 25°C"
    temp.empty? ? rt : "#{temp}; #{rt}"
  end

  # @return [String] the stripped first match of +regex+ in +text+, or ""
  def text_regex(text, regex)
    m = text.match(regex)
    return "" if m.nil?

    m[0].strip
  end
end
|
|
224
|
+
end
|
|
225
|
+
end
|