camdict 1.0.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+ require 'camdict/explanation'
3
+
4
module Camdict
  # Definition entry: an entry contains all definitions for one part of speech.
  # These helpers parse the entry to get senses (meanings and example
  # sentences), the irregular plural of a noun and the irregular forms of a
  # verb.
  #
  # The methods below call +where+, +derived_css+, +css_text+ and +senses+,
  # none of which is defined in this module; the including class has to
  # provide them.
  module Entry
    # One sense block: its part of speech, its guide word (category) and the
    # list of Camdict::Explanation objects found inside it.
    Sense = Struct.new(:part_of_speech, :category, :explanations)

    # Simple Past, Past Participle, PResent participle of a verb. Only
    # irregular verbs have these values. Its struct members are +sp+, +pp+,
    # +pr+.
    Irregular = Struct.new(:sp, :pp, :pr)

    private

    # Build one Sense per `.sense-block` node found in +html+. The part of
    # speech is looked up once and shared by every sense of the entry.
    def get_senses(html)
      part = pos(html)
      html.css('.sense-block').map do |block|
        Sense.new(part, category(block), explanations(block))
      end
    end

    # Text of the guide word of a sense block.
    def category(html)
      html.css('.guideword span').text
    end

    # Get explanations inside a definition block
    def explanations(html)
      html.css('.def-block').map { |db| Camdict::Explanation.new(db) }
    end

    # Part of speech of the entry, or nil when it cannot be found.
    # Where to look depends on the value returned by +where+.
    def pos(html)
      case where(html)
      when 'title', 'spellvar'
        # &. keeps a page without a part-of-speech header from raising
        # NoMethodError on nil.
        html.css(pos_selector).first&.text
      when 'derived'
        derived_css(html, pos_selector) { |node| return node.text }
      end
    end

    # CSS selector of the part-of-speech label.
    def pos_selector
      '.pos-header .pos'
    end

    # Return values: String, [String], nil
    # Irregular plural, like criteria. nil is returned when no sense is a noun.
    def get_plural(html)
      return unless part_of_speech?('noun')
      nodes = html.css(".pos-header .inf-group[type='plural'] .inf")
      return nodes.text if nodes.size < 2
      # fish has two
      nodes.map(&:text)
    end

    # Return nil or Irregular struct.
    # nil is returned when no sense is a verb.
    def get_irregular(html)
      return unless part_of_speech?('verb')
      present, past, past_participle = explicit_irregular(html)
      if past.nil? || past.empty?
        # No explicitly typed past tense: fall back to the plain inflection
        # list, e.g. arise.
        nodes = html.css('.pos-header .inf')
        past, past_participle = nodes.map(&:text) if nodes.size.positive?
      end
      Irregular.new(past, past_participle, present)
    end

    # True when at least one sense has a part of speech containing +name+.
    # Senses whose part of speech is nil (+pos+ may return nil) are ignored
    # instead of raising NoMethodError.
    def part_of_speech?(name)
      senses.any? { |sense| sense.part_of_speech&.include?(name) }
    end

    # Texts of the explicitly typed inflections, in the order
    # [present participle, past tense, past participle].
    def explicit_irregular(html)
      %w[pres_part past_tense past_part].map do |tense|
        css_text(html, irregular_selector(tense))
      end
    end

    # CSS selector of the inflection of the given +tense+ type.
    def irregular_selector(tense)
      ".pos-header .inf-group[type='#{tense}'] .inf"
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Camdict
  # Error type named for a word that could not be found. It adds no behaviour
  # of its own to StandardError.
  class WordNotFound < StandardError
  end
end
@@ -1,12 +1,13 @@
1
+ # frozen_string_literal: true
1
2
  require 'camdict/common'
3
+ require 'camdict/sentence'
2
4
 
3
5
  module Camdict
4
6
  # Explanation are inside the def-block node.
5
7
  class Explanation
6
-
7
- # Elementary level. It's a symbol indicating the level when learnders know
8
+ # Elementary level. It's a symbol indicating the level when learners know
8
9
  # this meaning.
9
- # A1: Beginner, A2: Elementary,
10
+ # A1: Beginner, A2: Elementary,
10
11
  # B1: Intermediate, B2: Upper-Intermediate,
11
12
  # C1: Advanced, C2: Proficiency
12
13
  attr_reader :level
@@ -33,96 +34,58 @@ module Camdict
33
34
  attr_reader :variant
34
35
 
35
36
  # Grammar code. Full list is http://dictionary.cambridge.org/help/codes.html
36
- attr_reader :gc
37
+ attr_reader :code
37
38
 
38
39
  # Parse +html+ to get level, meaning, example sentences, synonym, opposite,
39
40
  # usage, grammar code, region, variant.
40
41
  def initialize(html)
41
- @html = html
42
- @level = get_level # String
43
- @variant = get_variant # String
44
- @meaning = get_meaning # String
45
- @gc = css_text(".gcs") # String
46
- @usage = css_text(".usage") # String
47
- @region = css_text(".region") # String
48
- @examples = get_examples # [Sentence]
49
- @synonym = get_synonym # String
50
- @opposite = get_opposite # String
42
+ @level = get_level(html) # String
43
+ @variant = get_variant(html) # String
44
+ @meaning = get_meaning(html) # String
45
+ @code = css_text(html, '.gcs') # String
46
+ @usage = css_text(html, '.usage') # String
47
+ @region = css_text(html, '.region') # String
48
+ @examples = get_examples(html) # [Sentence]
49
+ @synonym = get_synonym(html) # String
50
+ @opposite = get_opposite(html) # String
51
51
  # todo: add usage panel - the word: somewhere.
52
52
  end
53
53
 
54
54
  private
55
+
55
56
  # A meaning may have a symbol representing the difficulty from A1-C2.
56
- def get_level
57
- css_text ".def-info .epp-xref"
57
+ def get_level(html)
58
+ css_text html, '.def-info .epp-xref'
58
59
  end
59
60
 
60
61
  # For an explanation, it may have a variant form word or phrase which has
61
62
  # same meaning.
62
- def get_variant
63
- css_text ".v[title='Variant form']"
63
+ def get_variant(html)
64
+ css_text html, ".v[title='Variant form']"
64
65
  end
65
66
 
66
67
  # The meaning of a word for this explanation.
67
- def get_meaning
68
- css_text(".def")
68
+ def get_meaning(html)
69
+ css_text(html, '.def')
69
70
  end
70
71
 
71
72
  # Get example sentences. Returned results are Sentence or nil.
72
- def get_examples
73
- nodes = @html.css(".examp")
74
- unless nodes.empty?
75
- @examples = nodes.map { |node|
76
- Camdict::Explanation::Sentence.new(node)
77
- }
78
- end
73
+ def get_examples(html)
74
+ nodes = html.css('.examp')
75
+ return if nodes.empty?
76
+ @examples = nodes.map { |node| Camdict::Sentence.new(node) }
79
77
  end
80
78
 
81
79
  # Parse and get synonym word
82
- def get_synonym
83
- css_text ".entry-xref[type='Synonym'] .x-h"
80
+ def get_synonym(html)
81
+ css_text html, ".entry-xref[type='Synonym'] .x-h"
84
82
  end
85
83
 
86
84
  # Parse and get opposite word
87
- def get_opposite
88
- css_text ".entry-xref[type='Opposite'] .x-h"
85
+ def get_opposite(html)
86
+ css_text html, ".entry-xref[type='Opposite'] .x-h"
89
87
  end
90
88
 
91
89
  include Camdict::Common
92
-
93
- # Parse the html to get the example sentence and its typical usage
94
- # information associated with this sentence.
95
- class Sentence
96
- # Get the grammar code or usage in this sentence.
97
- # It means how the word is used in this sentence.
98
- # For example, a grammar code for the word -
99
- # 'somewhere' is "+to infinitive". I'm looking for somewhere to eat.
100
- attr_reader :usage
101
-
102
- # Get one sentence inside an example block.
103
- attr_reader :sentence
104
-
105
- # New a sentence object from +html+ containing the eg block.
106
- def initialize(html)
107
- @html = html
108
- @usage = get_usage
109
- @sentence = get_sentence
110
- end
111
-
112
- private
113
- # Parse html node under block gcs or usage to get its grammar code or
114
- # usage info for this sentence.
115
- def get_usage
116
- css_text(".gcs") || css_text(".usage")
117
- end
118
-
119
- # Get sentence inside example block(.eg).
120
- def get_sentence
121
- css_text(".eg")
122
- end
123
-
124
- include Camdict::Common
125
- end
126
-
127
90
  end
128
91
  end
@@ -1,22 +1,26 @@
1
+ # frozen_string_literal: true
2
+ require 'open-uri'
3
+
1
4
module Camdict
  # HTTP module
  module HTTP
    # A default user agent string for this http client. It can be customised.
    AGENT =
      'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52'

    # HTTP Client class
    class Client
      # Class-level shortcut: builds a client and delegates to #get_html.
      def self.get_html(url, agtstr = AGENT)
        new.get_html(url, agtstr)
      end

      # Download a html page from a remote site, and return a Nokogiri::HTML
      # document. The default +agtstr+ is AGENT.
      #
      # +url+ is parsed with Kernel#URI and is NOT escaped by this method, so
      # it has to be a valid URI already (URI::InvalidURIError is raised by
      # URI() otherwise).
      def get_html(url, agtstr = AGENT)
        # URI#open (added to http/https/ftp URIs by open-uri) is used instead
        # of Kernel#open, which no longer accepts URIs since Ruby 3.0. The
        # block form closes the response once the page has been parsed.
        URI(url).open('User-Agent' => agtstr) { |page| Nokogiri::HTML(page) }
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
module Camdict
  # IPA related methods shall be included in Camdict::Definition
  #
  # The methods below call +where+ and +derived_css+, which are not defined
  # in this module; the including class has to provide them.
  module IPA
    # Struct IPA is International Phonetic Alphabet
    # +uk+: UK IPA; +k+: the superscript index in the UK IPA.
    # +us+: US IPA; +s+: the superscript index in the US IPA.
    IPA = Struct.new(:uk, :k, :us, :s)

    # Get the IPA
    attr_reader :ipa

    private

    # Parse the UK and US IPA out of +html+, store them in @ipa and return
    # the struct. All members stay nil when +where+ reports a location other
    # than 'title', 'spellvar' or 'derived'.
    def get_ipa(html)
      reader =
        case where(html)
        when 'title', 'spellvar' then :ipa_idx
        when 'derived' then :derived_ipa_idx
        end
      uk, uk_idx = send(reader, html, 'UK') if reader
      us, us_idx = send(reader, html, 'US') if reader
      @ipa = IPA.new(uk, uk_idx, us, us_idx)
    end

    # [ipa text, superscript index] of the first node matching +region+.
    def ipa_idx(html, region)
      first_match = html.css(ipa_selector(region)).first
      parse_ipa(first_match)
    end

    # Same as +ipa_idx+ but the node is located through +derived_css+; the
    # first node yielded by it ends the search.
    def derived_ipa_idx(html, region)
      derived_css(html, ipa_selector(region)) { |node| return parse_ipa(node) }
    end

    # CSS selector of the IPA node of +region+ ('UK' or 'US').
    def ipa_selector(region)
      %([pron-region="#{region}"] .ipa)
    end

    # Parse an ipa node to get the ipa string and its superscript index.
    # The index is a flat list of [position, length] pairs, one pair per
    # child whose class is 'sp', or nil when there is none. A nil +node+
    # gives [nil, nil].
    def parse_ipa(node)
      offset = 0
      superscripts = []
      node&.children&.each do |child|
        width = child.text.length
        superscripts.push(offset, width) if child['class'] == 'sp'
        offset += width
      end
      [node&.text, superscripts.empty? ? nil : superscripts]
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
module Camdict
  # pronunciation related methods shall be included in Camdict::Definition
  #
  # The methods below call +where+ and +derived_css+, which are not defined
  # in this module; the including class has to provide them.
  module Pronunciation
    # Struct Pronunciation has two members.
    # Each +uk+/+us+ has its own mp3/ogg links.
    Pronunciation = Struct.new(:uk, :us)
    # Struct Link has two members +mp3+ and +ogg+, which are the http links.
    Link = Struct.new(:mp3, :ogg)

    # Get the pronunciation
    attr_reader :pronunciation

    private

    # Get the UK/US pronunciation mp3/ogg links as Struct uk:Link, us:Link.
    # The result is memoised in @pronunciation, so +html+ is only parsed on
    # the first call.
    def get_pronunciation(html)
      @pronunciation ||= parse_pron(html)
    end

    # Locate the UK and US sound nodes according to +where+ and turn them
    # into links. For any location other than 'title' and 'derived' both
    # nodes stay nil, which +link+ maps to an empty Link.
    # NOTE(review): the sibling IPA/Entry helpers treat 'spellvar' like
    # 'title'; confirm whether that should apply here as well.
    def parse_pron(html)
      case where(html)
      when 'title'
        ukpron = pronunciation_node(html, 'UK')
        uspron = pronunciation_node(html, 'US')
      when 'derived'
        ukpron = pronunciation_derived(html, 'UK')
        uspron = pronunciation_derived(html, 'US')
      end
      Pronunciation.new(link(ukpron), link(uspron))
    end

    # Sound nodes of +region+ looked up directly in +html+.
    def pronunciation_node(html, region)
      html.css(pronunciation_selector(region))
    end

    # Sound node of +region+ located through +derived_css+; the first node
    # yielded by it is returned.
    def pronunciation_derived(html, region)
      derived_css(html, pronunciation_selector(region)) { |node| return node }
    end

    # CSS selector of the sound node of +region+ ('UK' or 'US').
    def pronunciation_selector(region)
      %([pron-region="#{region}"] .sound)
    end

    # Build a Link from the `data-src-mp3` / `data-src-ogg` attributes of
    # +pron+ (presumably a Nokogiri node set - TODO confirm).
    # A missing (nil) or empty +pron+ gives an empty Link, and a missing
    # attribute gives a nil member, instead of raising NoMethodError.
    def link(pron)
      return Link.new if pron.nil? || pron.empty?
      Link.new(pron.attr('data-src-mp3')&.text, pron.attr('data-src-ogg')&.text)
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+ require 'camdict/common'
3
+
4
module Camdict
  # One example sentence parsed from the html, together with the typical
  # usage information attached to it.
  class Sentence
    # The grammar code or usage of this sentence, i.e. how the word is used
    # in it. For example, a grammar code for the word 'somewhere' is
    # "+to infinitive": I'm looking for somewhere to eat.
    attr_reader :usage

    # The sentence found inside the example block.
    attr_reader :sentence

    # Build a sentence from +html+, the node containing the eg block. Both
    # attributes are extracted right away; +html+ itself is not kept.
    def initialize(html)
      @usage = get_usage(html)
      @sentence = get_sentence(html)
    end

    private

    # Grammar code (.gcs) of the sentence; when that lookup is nil or false
    # the usage (.usage) text is returned instead.
    def get_usage(html)
      grammar_code = css_text(html, '.gcs')
      grammar_code || css_text(html, '.usage')
    end

    # Text of the example block (.eg).
    def get_sentence(html)
      selector = '.eg'
      css_text(html, selector)
    end

    include Camdict::Common
  end
end
@@ -0,0 +1,141 @@
1
+ # frozen_string_literal: true
2
module Camdict
  # Extension: Refine String class.
  module StringExt
    refine String do
      # Test whether a String includes the +word+. It's useful while testing
      # a variable which might be an array of phrase or just a single phrase.
      alias_method :has?, :include?

      # Expand the alternatives written with a slash:
      #   'blow a kiss to/at sb'.flatten =>
      #   ['blow a kiss to sb', 'blow a kiss at sb']
      # A string without a slash is returned stripped. Phrases separated by
      # ';' are expanded one by one, and a part in round brackets is treated
      # as optional.
      def flatten
        # strip & remove the space surrounding '/'
        str = strip.gsub(%r{\s*\/\s*}, '/')
        if !str.include?('/')
          str
        elsif str.include?(';')
          f_semicolon(str)
        elsif str.include?('(')
          f_parenthese(str)
        else
          f_convert(str)
        end
      end

      private

      # Several phrases separated with ';': expand each and merge the results.
      def f_semicolon(str)
        # A block is used rather than map(&:flatten): the original author
        # notes the symbol form needs ruby 2.4 to see the refinement.
        pieces = str.split(';')
        pieces.map { |piece| piece.flatten }.flatten
      end

      # Round brackets mark an optional part: the result holds the phrase
      # without it, followed by the phrase with each alternative found inside
      # the brackets. Returns nil when no '(...)' pair can be matched.
      def f_parenthese(str)
        head, bracket, tail = str.partition(/\(.*\)/)
        return if bracket.empty?
        without_optional = (head.strip + tail).flatten
        with_optional = f_str_in_bracket(bracket).map do |inner|
          (head + inner + tail).flatten
        end
        [without_optional, *with_optional].flatten
      end

      # Alternatives found inside the brackets, always as a list.
      def f_str_in_bracket(bracket)
        inner = bracket.delete('()').flatten
        case inner
        when String then [inner]
        else inner
        end
      end

      # Expand a single group of slash separated alternatives.
      def f_convert(str)
        starts, ends, last = f_alternative_index(str)
        f_combine(str, starts, ends, last) if last.positive?
      end

      # Build one phrase per alternative 0..j. +b+/+e+ hold the first/last
      # character index of each alternative word.
      def f_combine(str, b, e, j)
        # Where the group sits does not depend on the alternative, so it is
        # worked out once.
        place =
          if f_alter_not_start_end?(str, b, e, j) then :middle
          elsif f_alter_at_end?(str, b, e, j) then :end
          elsif f_alter_at_start?(str, b, e, j) then :start
          end
        (0..j).map do |i|
          case place
          when :middle then f_word_not_start_end(str, b, e, i, j)
          when :end then f_word_at_end(str, b, e, i)
          when :start then f_word_at_start(str, b, e, i, j)
          else str[b[i]..e[i]]
          end
        end
      end

      # alternative word is not the last word and not at the beginning
      def f_alter_not_start_end?(str, b, e, j)
        e[j] + 1 < str.length && b[0].positive?
      end

      # alternative words end the string but do not begin it
      def f_alter_at_end?(str, b, e, j)
        e[j] + 1 == str.length && b[0].positive?
      end

      # alternative words begin the string but do not end it
      def f_alter_at_start?(str, b, e, j)
        e[j] + 1 < str.length && b[0].zero?
      end

      # text before the group + alternative +i+ + text after the group
      def f_word_not_start_end(str, b, e, i, j)
        f_word_at_end(str, b, e, i) + str[(e[j] + 1)..-1]
      end

      # text before the group + alternative +i+
      def f_word_at_end(str, b, e, i)
        str[0, b[0]] + str[b[i]..e[i]]
      end

      # alternative +i+ + text after the group
      def f_word_at_start(str, b, e, i, j)
        str[b[i]..e[i]] + str[(e[j] + 1)..-1]
      end

      # Scan +str+ and return [b, e, j]: begin indexes, end indexes and the
      # index of the last alternative word.
      def f_alternative_index(str)
        state = f_init
        f_alternative_loop(str, state)
        state.values_at(:b, :e, :j)
      end

      # Walk through +str+ one character at a time, updating the state +h+,
      # until the string ends or the group of alternatives is complete.
      def f_alternative_loop(str, h)
        until h[:quit] || h[:i] >= str.length
          case str[h[:i]]
          when /[[:alnum:]\-']/
            # valid char in a word
            f_update_start_end(h)
          when ' ', '!', '?', ',', '.'
            # char means a word has ended
            f_reset_or_quit(h)
          when '/'
            # 'or' separator
            f_include_next(h)
          else
            f_raise_not_implement_error(str, h)
          end
          h[:i] += 1
        end
      end

      # Initial scanner state.
      # +i+: current character index;
      # +j+: index of the current alternative word, 'to/at' has two;
      # +b+/+e+: index of the beginning/end of each alternative word;
      # +include_next+: set once a slash has been seen, i.e. the next word is
      # expected to be an alternative word;
      # +quit+: set when the scan is over.
      def f_init
        { i: 0, j: 0, b: [], e: [], include_next: false, quit: false }
      end

      # A slash was met: move on to the next alternative word.
      def f_include_next(h)
        h[:j] += 1
        h[:include_next] = true
      end

      # The current character is none of the supported ones.
      def f_raise_not_implement_error(str, h)
        raise NotImplementedError, "char '#{str[h[:i]]}' found in '#{self}'."
      end

      # Record the current character as part of the current word: its first
      # character sets the begin index, every character moves the end index.
      def f_update_start_end(h)
        h[:b][h[:j]] ||= h[:i]
        h[:e][h[:j]] = h[:i]
      end

      # A word has ended. After a slash this closes the group of
      # alternatives; otherwise the word was an ordinary one and is forgotten.
      def f_reset_or_quit(h)
        if h[:include_next]
          h[:quit] = true
        else
          h[:b][h[:j]] = nil
          h[:e][h[:j]] = nil
        end
      end
    end
  end
end