chem_scanner 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +604 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/CODE_OF_CONDUCT.md +74 -0
  9. data/Gemfile +20 -0
  10. data/LICENSE.txt +661 -0
  11. data/README.md +177 -0
  12. data/Rakefile +8 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/chem_scanner.gemspec +43 -0
  16. data/lib/chem_scanner.rb +79 -0
  17. data/lib/chem_scanner/cdx.rb +67 -0
  18. data/lib/chem_scanner/cdxml.rb +72 -0
  19. data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
  20. data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
  21. data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
  22. data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
  23. data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
  24. data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
  25. data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
  26. data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
  27. data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
  28. data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
  29. data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
  30. data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
  31. data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
  32. data/lib/chem_scanner/chem_draw/parser.rb +214 -0
  33. data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
  34. data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
  35. data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
  36. data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
  37. data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
  38. data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
  39. data/lib/chem_scanner/configuration/superatom.rb +76 -0
  40. data/lib/chem_scanner/configuration/superatom.txt +2874 -0
  41. data/lib/chem_scanner/configuration/util.rb +40 -0
  42. data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
  43. data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
  44. data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
  45. data/lib/chem_scanner/doc.rb +56 -0
  46. data/lib/chem_scanner/docx.rb +86 -0
  47. data/lib/chem_scanner/export/cml.rb +176 -0
  48. data/lib/chem_scanner/extension/element_map.rb +9 -0
  49. data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
  50. data/lib/chem_scanner/extension/geometry/line.rb +123 -0
  51. data/lib/chem_scanner/extension/geometry/point.rb +18 -0
  52. data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
  53. data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
  54. data/lib/chem_scanner/extension/passthrough.rb +7 -0
  55. data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
  56. data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
  57. data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
  58. data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
  59. data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
  60. data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
  61. data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
  62. data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
  63. data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
  64. data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
  65. data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
  66. data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
  67. data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
  68. data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
  69. data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
  70. data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
  71. data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
  72. data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
  73. data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
  74. data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
  75. data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
  76. data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
  77. data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
  78. data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
  79. data/lib/chem_scanner/interpreter/scheme.rb +173 -0
  80. data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
  81. data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
  82. data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
  83. data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
  84. data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
  85. data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
  86. data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
  87. data/lib/chem_scanner/perkin_eln.rb +287 -0
  88. data/lib/chem_scanner/version.rb +5 -0
  89. data/lib/rubygems_plugin.rb +5 -0
  90. metadata +244 -0
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
module ChemScanner
  module Interpreter
    # MoleculeGroup - molecules represented as text
    #
    # Collects the molecules built from the fragments added to the group,
    # together with the ids of those fragments. The group's +title+ object
    # (expected to respond to +value+ and +id+) supplies the abbreviation
    # assigned to every molecule that is added.
    class MoleculeGroup
      attr_accessor :title
      # NOTE: +polygon+ is exposed but never assigned inside this class.
      attr_reader :polygon, :molecules, :molecule_ids

      # @param title [#value, #id, nil] title object of the group; optional
      def initialize(title = nil)
        @title = title
        @molecules = []
        @molecule_ids = []
      end

      # Builds a Molecule from +fragment+, processes it and stores it along
      # with the fragment id.
      #
      # @param fragment [#id] fragment handed to Molecule.new
      # @return [Array] the updated list of molecule ids
      def add_fragment(fragment)
        mol = Molecule.new(fragment)
        mol.process
        # A group may be created without a title (see #initialize); in that
        # case the molecule keeps whatever abbreviation it already has.
        mol.abbreviation = title.value unless title.nil?
        molecules.push(mol)
        @molecule_ids.push(fragment.id)
      end

      # Debug representation; safe to call on a group without a title.
      #
      # @return [String]
      def inspect
        "#<MoleculeGroup: id=#{@title&.id}, " \
          "text: #{@title}, " \
          "molecule_ids: #{@molecule_ids}, " \
          "molecules: #{@molecules} >"
      end
    end
  end
end
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
module ChemScanner
  module Interpreter
    # Reaction
    #
    # Keeps the ids and the resolved molecule objects of one reaction
    # (reactants, reagents, products), the arrow it belongs to and the
    # textual details (description, temperature, yield, time, steps).
    #
    # NOTE(review): +OpenStruct+ is used without a require in this file;
    # presumably "ostruct" is loaded elsewhere in the gem - confirm.
    class Reaction
      attr_accessor :reactant_ids, :reagent_ids, :product_ids,
                    :text_ids, :arrow_id, :arrow,
                    :reactants, :products, :reagents,
                    :reagent_smiles, :reagent_abbs,
                    :description, :temperature, :yield, :time,
                    :steps, :details, :clone_from

      def initialize
        @arrow = nil

        @reactant_ids = []
        @reagent_ids = []
        @product_ids = []
        @text_ids = []

        @reactants = []
        @reagents = []
        @products = []
        @reagent_smiles = []
        @reagent_abbs = []

        @description = ""
        @temperature = ""
        @yield = ""
        @time = ""
        @details = OpenStruct.new

        @steps = []
      end

      # Reaction SMILES in the form "reactants>reagents>products".
      # Reagent molecules without a canonical SMILES are skipped; the
      # additional SMILES stored in +reagent_smiles+ are appended.
      #
      # @return [String]
      def reaction_smiles
        reactant_part = @reactants.map(&:cano_smiles).join(".")
        product_part = @products.map(&:cano_smiles).join(".")

        reagent_part = @reagents.map(&:cano_smiles).compact
        reagent_part = (reagent_part + @reagent_smiles).join(".")

        "#{reactant_part}>#{reagent_part}>#{product_part}"
      end

      # @return [Array] the +:mdl+ entry of every reactant
      def reactant_molfiles
        @reactants.map { |r| r[:mdl] }
      end

      # @return [Array] the +:mdl+ entry of every reagent
      def reagent_molfiles
        @reagents.map { |r| r[:mdl] }
      end

      # @return [Array] the +:mdl+ entry of every product
      def product_molfiles
        @products.map { |r| r[:mdl] }
      end

      # @return [String] one-line summary built from the id lists
      def debug_print
        "reaction #{@arrow_id}: "\
        "#{reactant_ids} > #{reagent_ids} > #{product_ids}"
      end

      # @return [String] like #debug_print, followed by the reaction SMILES
      def debug_print_smiles
        "reaction #{@arrow_id}: "\
        "#{reactant_ids} - #{reagent_ids} - #{product_ids}: #{reaction_smiles}"
      end

      # @return [Array] reactant ids followed by product ids
      def molecule_ids
        @reactant_ids + @product_ids
      end

      # @return [Array] reagent ids followed by #molecule_ids
      def all_ids
        @reagent_ids + molecule_ids
      end

      # Removes +id+ from every id list. Array#delete is already a no-op
      # when the id is absent, so no membership test is needed.
      def delete_id(id)
        [@reactant_ids, @reagent_ids, @product_ids].each do |group|
          group.delete(id)
        end
      end

      # Replaces +old_id+ by +new_id+ in every id list that contains it.
      # The new id is appended at the end of the list.
      def replace_id(old_id, new_id)
        [@reactant_ids, @reagent_ids, @product_ids].each do |group|
          next unless group.include?(old_id)

          group.delete(old_id)
          group.push(new_id)
        end
      end

      # Swaps the molecule whose +id+ or +clone_from+ equals +old_id+ for
      # +new_mol+ in every molecule list, then updates the id lists.
      #
      # @param old_id id (or clone origin id) of the molecule to replace
      # @param new_mol [#id] replacement molecule
      def replace_molecule(old_id, new_mol)
        oid = old_id

        [@reactants, @reagents, @products].each do |group|
          idx = group.index { |m| [m.id, m.clone_from].include?(old_id) }
          next if idx.nil?

          replaced = group[idx]
          # The id lists reference the clone origin when there is one.
          oid = replaced.clone_from unless replaced.clone_from.nil?
          group[idx] = new_mol
        end

        replace_id(oid, new_mol.id)
      end

      # Removes the molecule with the given id from every molecule list.
      def delete_molecule_by_id(id)
        [@reactants, @reagents, @products].each do |group|
          group.delete_if { |mol| mol.id == id }
        end
      end

      # Status derived from the arrow (+cross+, +line_type+) and from the
      # products' +check_red+ flag. Requires +arrow+ to be assigned.
      #
      # NOTE(review): "Succesful" is misspelled, but it is a runtime value
      # that callers may compare against, so it is kept verbatim.
      #
      # @return [String] "Failed", "Planned" or "Succesful"
      def status
        return "Failed" if @arrow.cross
        return "Planned" if @arrow.line_type == 1

        return "Failed" unless @products.detect(&:check_red).nil?

        "Succesful"
      end

      # Copy with cloned molecules, a cloned arrow (under a new temp id)
      # and duplicated text details. +clone_from+ points at the original
      # reaction's arrow id.
      #
      # NOTE(review): +text_ids+, +reagent_abbs+ and +steps+ are not
      # copied, exactly as before - confirm whether that is intended.
      #
      # @return [Reaction]
      def clone
        cloned = self.class.new
        unless @arrow.nil?
          cloned.arrow_id = @arrow.get_tempid
          cloned.arrow = @arrow.clone
        end

        %w[reactant reagent product].each do |group|
          cloned_groups = cloned.send("#{group}s")
          groups = instance_variable_get("@#{group}s")

          groups.each { |m| cloned_groups.push(m.clone) }
          cloned.send("#{group}_ids=", cloned_groups.map(&:id))
        end

        cloned.reagent_smiles = @reagent_smiles.dup

        cloned.description = @description.dup
        cloned.temperature = @temperature.dup
        cloned.yield = @yield.dup
        cloned.time = @time.dup
        cloned.details = @details.dup

        cloned.clone_from = @clone_from.nil? ? arrow_id : @clone_from

        cloned
      end

      # Hash representation with the molecule lists ordered by canonical
      # SMILES. A missing (nil) SMILES sorts as the empty string instead of
      # raising a comparison error.
      #
      # @return [Hash]
      def to_hash
        by_smiles = ->(mol) { mol.cano_smiles.to_s }

        {
          id: arrow_id,
          reactants: @reactants.sort_by(&by_smiles).map(&:to_hash),
          reagents: @reagents.sort_by(&by_smiles).map(&:to_hash),
          products: @products.sort_by(&by_smiles).map(&:to_hash),
          steps: @steps.map(&:to_hash),
          reagent_smiles: reagent_smiles.sort,
          description: @description,
          temperature: @temperature,
          yield: @yield,
          time: @time,
          details: @details.to_h,
        }
      end

      # Debug representation; safe to call before an arrow is assigned.
      #
      # @return [String]
      def inspect
        "#<Reaction: id=#{@arrow&.id}, " \
          "reactant_ids=#{@reactant_ids}, " \
          "reagent_ids=#{@reagent_ids}, " \
          "product_ids=#{@product_ids}, " \
          "text_ids=#{@text_ids}, " \
          "reactants=#{@reactants}, " \
          "reagents=#{@reagents}, " \
          "products=#{@products}, " \
          "reagent_smiles=#{@reagent_smiles}, " \
          "description=#{@description}, " \
          "temperature=#{@temperature}, " \
          "yield=#{@yield}, " \
          "time=#{@time}, " \
          "details=#{@details} >"
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module ChemScanner
  module Interpreter
    # Reaction Step
    #
    # Plain value holder for one step of a reaction: its number, free-text
    # description, time, temperature and the reagents used.
    class ReactionStep
      attr_accessor :description, :time, :temperature, :reagents, :number

      def initialize
        @number = 0
        @description = ""
        @time = ""
        @temperature = ""

        @reagents = []
      end

      # Debug representation. The closing " >" matches the format used by
      # the other interpreter elements (it was missing before).
      #
      # @return [String]
      def inspect
        "#<ReactionStep: description=#{@description}, " \
          "number=#{@number}, " \
          "time=#{@time}, " \
          "temperature=#{@temperature}, " \
          "reagents=#{@reagents} >"
      end

      # @return [Hash] the step's attributes keyed by symbol
      def to_hash
        {
          number: @number,
          description: @description,
          time: @time,
          temperature: @temperature,
          reagents: @reagents,
        }
      end
    end
  end
end
@@ -0,0 +1,75 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
module ChemScanner
  module Interpreter
    OPEN_MARK = '[\(\[\{]'.freeze
    CLOSE_MARK = '[\)\]\}]'.freeze

    # NOTE: WIP file
    #
    # Entry point for plain inorganic formulas.
    #
    # @param text [Object] candidate formula; anything but a String yields nil
    # @return [Array<Array<Integer>>, nil] the result of #parse_formula for
    #   bracket-free formulas; nil for non-strings and for formulas that
    #   contain an opening bracket (bracket handling is not implemented -
    #   the previous code referenced an undefined helper at that point)
    def mol_from_inorganic_formula(text)
      return nil unless text.is_a?(String)
      return nil if text =~ /#{OPEN_MARK}/

      parse_formula(text)
    end

    # Splits +formula+ into element symbols (with optional one- or
    # two-digit counts) and searches for valence assignments whose sum
    # equals +out_valence+.
    #
    # ELEMENTS entries are read as hashes with a "name" and a "valences"
    # entry; the candidate valences are taken from +valences[2]+.
    #
    # NOTE(review): as in the original draft, element counts (+:num+) are
    # parsed but not applied to the valence sum - confirm the intended
    # chemistry before relying on the result.
    #
    # @param formula [String] bracket-free formula, e.g. "NaCl"
    # @param out_valence [Integer] required total of the chosen valences
    # @return [Array<Array<Integer>>, nil] one entry per matching
    #   assignment; each entry lists, for every element after the first,
    #   the index of the chosen value in that element's valence list.
    #   nil when the formula has unknown characters/elements, fewer than
    #   two elements, or an element whose first valence is zero.
    def parse_formula(formula, out_valence = 0)
      # NOTE: sort alphabetically then by length,
      # so that C will not be caught first in Ca
      el_names = ELEMENTS.map { |x| x["name"] }
      els = el_names.sort_by { |a| [a[0], -a.size] }.join("|")
      num = "[1-9]"
      charge = "[-+]"
      return nil unless formula.split(/#{els}|#{num}|#{charge}/).empty?

      el_arr = formula.scan(/(#{els})(#{num}{0,2})/).map do |el, elnum|
        el_info = ELEMENTS.detect { |e| e["name"] == el }
        return nil if el_info.nil? || el_info["valences"][2].first.zero?

        {
          name: el,
          num: elnum.empty? ? 1 : elnum.to_i,
          valences: el_info["valences"][2],
        }
      end
      # Nothing to balance with fewer than two elements (this also covers
      # the empty formula, which used to crash on a nil first element).
      return nil if el_arr.size < 2

      fel = el_arr.first
      others = el_arr[1..-1]

      # Every way of picking one valence index per remaining element.
      index_choices = others.map { |el| (0...el[:valences].size).to_a }
      index_combinations =
        index_choices.first.product(*index_choices.drop(1))

      valence_combination = []
      fel[:valences].each do |fvalen|
        index_combinations.each do |idx_iter|
          vasum = idx_iter.each_with_index.reduce(0) do |sum, (val_idx, pos)|
            sum + others[pos][:valences][val_idx]
          end

          valence_combination.push(idx_iter) if (vasum + fvalen) == out_valence
        end
      end

      valence_combination
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
module ChemScanner
  # Interpret the parsed/extracted geometry block
  module Interpreter
    class Scheme
      # Resolves the id lists of every reaction in @reactions into objects.
      #
      # For each reactant/reagent/product id:
      # * an id found in @text_map is recorded in the reaction's text_ids;
      # * otherwise an id found in @mol_map adds that molecule to the
      #   matching molecule list;
      # * otherwise every molecule group in @mol_group_map that lists the
      #   id contributes its first molecule.
      #
      # Finally the reaction's arrow is looked up in @arrow_map and the
      # arrow's text ids are merged (without duplicates) into text_ids.
      #
      # NOTE(review): a group always contributes +molecules.first+, not the
      # molecule belonging to the matched id - kept as is, confirm intent.
      def assemble_reaction
        @reactions.each do |r|
          %w[reactant reagent product].each do |group|
            molecules = r.send("#{group}s")

            r.send("#{group}_ids").each do |id|
              if @text_map.key?(id)
                r.text_ids.push(id)
              elsif @mol_map.key?(id)
                molecules.push(@mol_map[id])
              else
                # Walk the groups directly; no intermediate Hash needed.
                @mol_group_map.each_value do |mgroup|
                  next unless mgroup.molecule_ids.include?(id)

                  molecules.push(mgroup.molecules.first)
                end
              end
            end
          end

          r.arrow = @arrow_map[r.arrow_id]
          r.text_ids.concat(r.arrow.text_arr).uniq!
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ChemScanner
4
+ module Interpreter
5
+ using Extension
6
+
7
module PostProcess
  # Adds molecules referenced by label on a reaction arrow to the
  # reaction's reagent ids.
  #
  # For every text attached to the arrow of every reaction in @reactions:
  # * each bold token (split on ABB_DELIM) is compared with the +label+ of
  #   the molecules in @mol_map;
  # * each non-bold token of at least three characters that does not match
  #   /eq(uiv)?\.?/ is compared (stripped) with the molecules' stripped
  #   +text+.
  # The id of the first matching molecule is appended to +reagent_ids+
  # unless it is already listed.
  def replace_label_by_molecule
    @reactions.each do |r|
      @arrow_map[r.arrow_id].text_arr.each do |tid|
        text = @text_map[tid]

        label_tokens(text.bold_text).each do |bold|
          push_reagent_where(r) { |m| m.label == bold }
        end

        label_tokens(text.non_bold_text).each do |plain|
          next if plain.length < 3 || plain =~ /eq(uiv)?\.?/

          wanted = plain.strip
          push_reagent_where(r) { |m| m.text.strip == wanted }
        end
      end
    end
  end

  private

  # Splits a stripped text on ABB_DELIM and drops empty tokens.
  def label_tokens(raw)
    raw.strip.split(ABB_DELIM).reject(&:empty?)
  end

  # Appends the id of the first molecule in @mol_map for which the block
  # returns true to +reaction.reagent_ids+, unless it is already present.
  def push_reagent_where(reaction)
    pair = @mol_map.detect { |_, m| yield(m) }
    return if pair.nil?

    mid = pair.first
    reaction.reagent_ids.push(mid) unless reaction.reagent_ids.include?(mid)
  end
end
36
+ end
37
+ end
@@ -0,0 +1,225 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module ChemScanner
5
+ # Interpreter of extracted/scanned information
6
+ module Interpreter
7
+ using Extension
8
+
9
# Words accepted between two durations, next to the range markers.
JOIN_WORDS = %w[and with plus].freeze

# Zero-width boundaries placed before/after a value: whitespace, common
# punctuation, brackets, or the start/end of the text or line.
# A lookaround consumes no input, so it is written once; the former
# trailing "+" quantifier added nothing.
START_REGEX = '(?<=\s|,|;|\n|\r|\[|\(|\.|\A|^)'
ENDING_REGEX = '(?=\s|,|;|\n|\r|\]|\)|\.|\z|$)'

# Degree sign followed by C or F, or the single-character forms.
DEGREE_REGEX = '((°\s*[CF])|(℃|℉))'
# Separators between the two ends of a range.
RANGE_REGEX = "(-|−|–|—|~|to|till|until)"
16
+
17
module PostProcess
  # Collects the texts attached to +reaction+, resolves abbreviations into
  # reagent SMILES and fills in temperature, yield, time and description.
  #
  # A yield found in the product texts takes precedence over a yield
  # found in the reaction texts.
  #
  # @param reaction reaction object exposing text_ids, reagents,
  #   reagent_smiles, reagent_abbs, products and the detail writers
  def process_reaction_info(reaction)
    descs = []
    reaction.text_ids.each do |tid|
      text = @text_map[tid].value
      descs.push(text)

      mgroup = @mol_group_map[tid]
      if mgroup.nil?
        abb_mol = name_to_struct(text)
        reaction.reagent_smiles.concat(abb_mol.values)
        reaction.reagent_abbs.concat(abb_mol.keys)
      else
        mtext = mgroup.title.value

        if mtext == text
          merge_chemdraw_with_predefined(mgroup, reaction)
        else
          descs.push(mtext)
          reaction.reagents.concat(mgroup.molecules)

          abb_mol = name_to_struct(mtext)
          reaction.reagent_smiles.concat(abb_mol.values)
        end
      end
    end

    temperature, ryield, time = extract_reaction_info(descs)
    pyield = extract_product_yield(reaction)

    reaction.temperature = temperature
    reaction.yield = pyield.empty? ? ryield : pyield
    reaction.time = time
    reaction.description = descs.reject { |e| e.to_s.empty? }.join("\n")
  end

  # @return [Array<String>] tokens of +text+ (split on ABB_DELIM) that are
  #   longer than one character
  def split_text(text)
    text.split(ABB_DELIM).select { |t| t.length > 1 }
  end

  # Maps the abbreviations found in +text+ to their structures.
  #
  # Single tokens are looked up first; the tokens without a hit are joined
  # with spaces and searched for abbreviations that contain a space.
  #
  # @return [Hash{String => Object}] abbreviation => value returned by
  #   ChemScanner.get_abbreviation
  def name_to_struct(text)
    smis = {}
    remain = []

    split_text(text).each do |token|
      smi = ChemScanner.get_abbreviation(token)

      if smi.empty?
        remain.push(token)
      else
        smis[token] = smi
      end
    end
    return smis if remain.empty?

    tmp = remain.join(" ")
    spaced_keys = ChemScanner.all_abbreviations.keys.select do |key|
      key.include?(" ")
    end

    spaced_keys.each do |abb|
      next unless tmp.include?(abb)

      # Consume the hit so it cannot be matched a second time.
      tmp.slice!(abb)
      smis[abb] = ChemScanner.get_abbreviation(abb)
    end

    smis
  end

  # For a molecule group whose title equals the reaction text: title
  # tokens with a known abbreviation contribute their SMILES, the others
  # contribute the group's molecule at the same token position (if any).
  def merge_chemdraw_with_predefined(mgroup, reaction)
    mtext = mgroup.title.value
    abb_hash = name_to_struct(mtext)

    split_text(mtext).each_with_index do |text, idx|
      abb_smi = abb_hash[text]

      if abb_smi.nil?
        mol = mgroup.molecules[idx]
        reaction.reagents.push(mol) unless mol.nil?
      else
        reaction.reagent_smiles.push(abb_smi)
      end
    end
  end

  # @param descs [Array<String>] description texts
  # @return [Array(String, String, String)] temperatures, yields and times
  #   found in +descs+, each joined with ";"
  def extract_reaction_info(descs)
    ryield = []
    temperatures = []
    times = []

    descs.each do |desc|
      dyield = extract_yield_info(desc)
      ryield.push(dyield) unless dyield.empty?

      temp = extract_temperature(desc)
      temperatures.push(temp) unless temp.empty?

      time = extract_time_info(desc)
      times.push(time) unless time.empty?
    end

    [
      temperatures.join(";"),
      ryield.join(";"),
      times.join(";"),
    ]
  end

  # Yields found in the texts of the reaction's products, joined with ";".
  # Products without a yield are skipped, so the result is empty (and the
  # caller falls back to the text-derived yield) when no product states
  # one - matching how #extract_reaction_info collects its values.
  #
  # @return [String]
  def extract_product_yield(reaction)
    pyields = []

    reaction.products.each do |mol|
      label = mol.text.strip
      next if label.empty?

      pyield = extract_yield_info(label)
      pyields.push(pyield) unless pyield.empty?
    end

    pyields.join(";")
  end

  # Regex source for "number unit" or "number [unit] range number unit".
  #
  # @param unit_regex [String] regex source of the unit
  # @param can_negative [Boolean] allow a leading minus/dash sign
  # @return [String] regex source (not compiled)
  def range_number_regex(unit_regex, can_negative)
    sign = can_negative ? "(-|−|–|—)?\\s*" : ""
    # Single-quoted so that "\." stays an escaped, literal dot.
    real_number = '(\d+|\d+\.\d+)'

    "#{sign}(#{real_number}\\s*#{unit_regex}?\\s*" \
      "#{RANGE_REGEX})?#{real_number}\\s*#{unit_regex}"
  end

  # Regex for a duration ("2 h"), optionally preceded by another number
  # or duration and a linker ("2-3 h", "2 h and 30 min"). The first
  # capture group holds the whole expression.
  #
  # @return [Regexp]
  def time_duration_range_regex
    day = "days?|dy|d"
    hour = "hours?|hrs?|h"
    minute = "minutes?|mins?|m"
    second = "seconds?|secs?|s"
    real_number = '(\d+|\d+\.\d+)'

    time_unit = "(#{day}|#{hour}|#{minute}|#{second})"
    join_words = JOIN_WORDS.join("|")
    linker_regex = "(#{RANGE_REGEX}|(#{join_words}))"
    # The leading part is optional as a whole; previously only its unit
    # was optional, which made a second number mandatory.
    leading = "(?:#{real_number}\\s*#{time_unit}?\\s*(#{linker_regex}\\s*)?)?"

    %r{
      #{START_REGEX}
      (#{leading}(#{real_number}\s*#{time_unit}))
      #{ENDING_REGEX}
    }x
  end

  # First percentage (or percentage range) in +text+ that is not followed
  # by "ee".
  #
  # @return [String] the match, or "" when there is none
  def extract_yield_info(text)
    yield_regex_str = range_number_regex("%", false)
    yield_regex = %r{
      #{START_REGEX}
      #{yield_regex_str}(?!\s*ee)
      #{ENDING_REGEX}
    }x

    text_regex(text, yield_regex)
  end

  # All durations in +text+; "overnight"/"ovn"/"o/n" adds "12h ~ 20h".
  #
  # @return [String] values joined with ";", or ""
  def extract_time_info(text)
    time = []
    text.scan(time_duration_range_regex) { |m| time << m[0] }

    ovn_regex = %r{
      #{START_REGEX}
      (overnight|ovn|o/n)
      #{ENDING_REGEX}
    }xi
    ovn = text_regex(text, ovn_regex)
    time.push("12h ~ 20h") unless ovn.empty?

    time.join(";")
  end

  # First temperature (or temperature range) in +text+; a room-temperature
  # marker ("rt", "r.t.") adds "20°C ~ 25°C".
  #
  # @return [String] the value(s), separated by "; ", or ""
  def extract_temperature(text)
    temp_regex_str = range_number_regex(DEGREE_REGEX, true)
    temperature_regex = %r{
      #{START_REGEX}
      #{temp_regex_str}
      #{ENDING_REGEX}
    }x
    temp = text_regex(text, temperature_regex)

    rt_regex = %r{
      #{START_REGEX}
      r\.?t\.?
      #{ENDING_REGEX}
    }xi
    m = text.match(rt_regex)
    return temp if m.nil? || m[0].empty?

    rt = "20°C ~ 25°C"
    temp.empty? ? rt : "#{temp}; #{rt}"
  end

  # @return [String] the stripped first match of +regex+ in +text+, or ""
  def text_regex(text, regex)
    m = text.match(regex)
    return "" if m.nil?

    m[0].strip
  end
end
224
+ end
225
+ end