RubyGems - camdict - Versions diffs - 1.0.3 → 2.0.0 - Mend

camdict 1.0.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

checksums.yaml +4 -4
data/README.md +28 -33
data/lib/camdict/array_ext.rb +37 -0
data/lib/camdict/client.rb +133 -97
data/lib/camdict/common.rb +25 -143
data/lib/camdict/definition.rb +65 -596
data/lib/camdict/entry.rb +76 -0
data/lib/camdict/exception.rb +5 -0
data/lib/camdict/explanation.rb +29 -66
data/lib/camdict/http_client.rb +14 -10
data/lib/camdict/ipa.rb +52 -0
data/lib/camdict/pronunciation.rb +53 -0
data/lib/camdict/sentence.rb +38 -0
data/lib/camdict/string_ext.rb +141 -0
data/lib/camdict/word.rb +83 -17
data/test/debug.rb +60 -0
data/test/helper.rb +2 -0
data/test/itest_client.rb +39 -8
data/test/itest_definition.rb +24 -75
data/test/itest_entry.rb +37 -0
data/test/itest_explanation.rb +41 -20
data/test/itest_ipa.rb +105 -0
data/test/itest_pronunciation.rb +74 -0
data/test/itest_word.rb +49 -0
data/test/test_array_ext.rb +23 -0
data/test/test_client.rb +35 -42
data/test/test_common.rb +22 -78
data/test/test_explanation.rb +21 -25
data/test/test_http_client.rb +27 -13
data/test/test_string_ext.rb +95 -0
metadata +42 -7
data/test/test_definition.rb +0 -345

data/lib/camdict/entry.rb ADDED Viewed

@@ -0,0 +1,76 @@
+# frozen_string_literal: true
+require 'camdict/explanation'
+module Camdict
+  # Definition entry: an entry contains all definitions for a part of speech.
+  # Parses the entry to get meanings and example sentences.
+  module Entry
+    private
+    Sense = Struct.new(:part_of_speech, :category, :explanations)
+    def get_senses(html)
+      pos = pos(html)
+      html.css('.sense-block').map do |sb|
+        Sense.new(pos, category(sb), explanations(sb))
+      end
+    end
+    def category(html)
+      html.css('.guideword span').text
+    end
+    # Get explanations inside a definition block
+    def explanations(html)
+      html.css('.def-block').map { |db| Camdict::Explanation.new(db) }
+    end
+    def pos(html)
+      case where(html)
+      when 'title', 'spellvar'
+        html.css(pos_selector).first.text
+      when 'derived'
+        derived_css(html, pos_selector) { |node| return node.text }
+      end
+    end
+    def pos_selector
+      '.pos-header .pos'
+    end
+    # Return values: String, [String], nil
+    # Irregular plural, like criteria
+    def get_plural(html)
+      return unless senses.any? { |s| s.part_of_speech.include? 'noun' }
+      node = html.css(".pos-header .inf-group[type='plural'] .inf")
+      return node.text if node.size < 2
+      # fish has two
+      node.map(&:text)
+    end
+    # Simple Past, Past Participle, Present participle of a verb. Only irregular
+    # verbs have these values. Its struct members are +sp+, +pp+, +pr+.
+    Irregular = Struct.new(:sp, :pp, :pr)
+    # Return nil or Irregular struct
+    def get_irregular(html)
+      return unless senses.any? { |s| s.part_of_speech.include? 'verb' }
+      present, sp, pp = explicit_irregular(html)
+      if sp.nil? || sp.empty?
+        node = html.css('.pos-header .inf') # arise
+        sp, pp = node.map(&:text) if node.size.positive?
+      end
+      Irregular.new(sp, pp, present)
+    end
+    def explicit_irregular(html)
+      [css_text(html, irregular_selector('pres_part')),
+       css_text(html, irregular_selector('past_tense')),
+       css_text(html, irregular_selector('past_part'))]
+    end
+    def irregular_selector(tense)
+      ".pos-header .inf-group[type='#{tense}'] .inf"
+    end
+  end
+end

data/lib/camdict/exception.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module Camdict
+  class WordNotFound < StandardError; end
+end

data/lib/camdict/explanation.rb CHANGED Viewed

@@ -1,12 +1,13 @@
+# frozen_string_literal: true
 require 'camdict/common'
+require 'camdict/sentence'
 module Camdict
   # Explanation are inside the def-block node.
   class Explanation
-    # Elementary level. It's a symbol indicating the level when learnders know
+    # Elementary level. It's a symbol indicating the level when learners know
     # this meaning.
-    #   A1: Beginner,       A2: Elementary,
+    #   A1: Beginner,       A2: Elementary,
     #   B1: Intermediate,   B2: Upper-Intermediate,
     #   C1: Advanced,       C2: Proficiency
     attr_reader :level
@@ -33,96 +34,58 @@ module Camdict
     attr_reader :variant
     # Grammar code. Full list is http://dictionary.cambridge.org/help/codes.html
-    attr_reader :gc
+    attr_reader :code
     # Parse +html+ to get level, meaning, example sentences, synonym, opposite,
     # usage, grammar code, region, variant.
     def initialize(html)
-      @html = html
-      @level = get_level                      # String
-      @variant = get_variant                  # String
-      @meaning = get_meaning                  # String
-      @gc = css_text(".gcs")                  # String
-      @usage = css_text(".usage")             # String
-      @region = css_text(".region")           # String
-      @examples = get_examples                # [Sentence]
-      @synonym = get_synonym                  # String
-      @opposite = get_opposite                # String
+      @level = get_level(html)                      # String
+      @variant = get_variant(html)                  # String
+      @meaning = get_meaning(html)                  # String
+      @code = css_text(html, '.gcs')                # String
+      @usage = css_text(html, '.usage')             # String
+      @region = css_text(html, '.region')           # String
+      @examples = get_examples(html)                # [Sentence]
+      @synonym = get_synonym(html)                  # String
+      @opposite = get_opposite(html)                # String
       # todo: add usage panel - the word: somewhere.
     end
     private
     # A meaning may have a symbol representing the difficulty from A1-C2.
-    def get_level
-      css_text ".def-info .epp-xref"
+    def get_level(html)
+      css_text html, '.def-info .epp-xref'
     end
     # For an explanation, it may have a variant form word or phrase which has
     # same meaning.
-    def get_variant
-      css_text ".v[title='Variant form']"
+    def get_variant(html)
+      css_text html, ".v[title='Variant form']"
     end
     # The meaning of a word for this explanation.
-    def get_meaning
-      css_text(".def")
+    def get_meaning(html)
+      css_text(html, '.def')
     end
     # Get example sentences. Returned results are Sentence or nil.
-    def get_examples
-      nodes = @html.css(".examp")
-      unless nodes.empty?
-        @examples = nodes.map { |node|
-          Camdict::Explanation::Sentence.new(node)
-        }
-      end
+    def get_examples(html)
+      nodes = html.css('.examp')
+      return if nodes.empty?
+      @examples = nodes.map { |node| Camdict::Sentence.new(node) }
     end
     # Parse and get synonym word
-    def get_synonym
-      css_text ".entry-xref[type='Synonym'] .x-h"
+    def get_synonym(html)
+      css_text html, ".entry-xref[type='Synonym'] .x-h"
     end
     # Parse and get opposite word
-    def get_opposite
-      css_text ".entry-xref[type='Opposite'] .x-h"
+    def get_opposite(html)
+      css_text html, ".entry-xref[type='Opposite'] .x-h"
     end
     include Camdict::Common
-    # Parse the html to get the example sentence and its typical usage
-    # information associated with this sentence.
-    class Sentence
-      # Get the grammar code or usage in this sentence.
-      # It means how the word is used in this sentence.
-      # For example, a grammar code for the word -
-      # 'somewhere' is "+to infinitive". I'm looking for somewhere to eat.
-      attr_reader :usage
-      # Get one sentence inside an example block.
-      attr_reader :sentence
-      # New a sentence object from +html+ containing the eg block.
-      def initialize(html)
-        @html = html
-        @usage = get_usage
-        @sentence = get_sentence
-      end
-      private
-      # Parse html node under block gcs or usage to get its grammar code or
-      # usage info for this sentence.
-      def get_usage
-        css_text(".gcs") || css_text(".usage")
-      end
-      # Get sentence inside example block(.eg).
-      def get_sentence
-        css_text(".eg")
-      end
-      include Camdict::Common
-    end
   end
 end

data/lib/camdict/http_client.rb CHANGED Viewed

@@ -1,22 +1,26 @@
+# frozen_string_literal: true
+require 'open-uri'
 module Camdict
+  # HTTP module
   module HTTP
-    require "open-uri"
     # A default user agent string for this http client. It can be customised.
-    AGENT =
-      "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0"
+    AGENT =
+      'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52'
+    # HTTP Client class
     class Client
       # Download a html page from a remote site, and return a Nokogiri::HTML
       # +url+ will be escaped by this method, and default +agtstr+ is AGENT.
-      def self.get_html(url, agtstr=AGENT)
-        url = URI.escape(url)
-        Nokogiri::HTML(open(url, "User-Agent"=>agtstr))
+      def self.get_html(url, agtstr = AGENT)
+        new.get_html(url, agtstr)
       end
+      # see +self.get_html+
+      def get_html(url, agtstr = AGENT)
+        url = URI(url)
+        Nokogiri::HTML(open(url, 'User-Agent' => agtstr))
+      end
     end
   end
 end

data/lib/camdict/ipa.rb ADDED Viewed

@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+module Camdict
+  # IPA related methods shall be included in Camdict::Definition
+  module IPA
+    # Get the IPA
+    attr_reader :ipa
+    private
+    # Struct IPA is International Phonetic Alphabet
+    # +uk+: UK IPA;   +k+: the superscript index in the UK IPA.
+    # +us+: US IPA;   +s+: the superscript index in the US IPA.
+    IPA = Struct.new(:uk, :k, :us, :s)
+    def get_ipa(html)
+      case where(html)
+      when 'title', 'spellvar'
+        uk, uk_idx = ipa_idx(html, 'UK')
+        us, us_idx = ipa_idx(html, 'US')
+      when 'derived'
+        uk, uk_idx = derived_ipa_idx(html, 'UK')
+        us, us_idx = derived_ipa_idx(html, 'US')
+      end
+      @ipa = IPA.new(uk, uk_idx, us, us_idx)
+    end
+    def ipa_idx(html, region)
+      parse_ipa html.css(ipa_selector(region)).first
+    end
+    def derived_ipa_idx(html, region)
+      derived_css(html, ipa_selector(region)) { |node| return parse_ipa(node) }
+    end
+    def ipa_selector(region)
+      %([pron-region="#{region}"] .ipa)
+    end
+    # Parse an ipa node to get the ipa string and its superscript index
+    def parse_ipa(node)
+      position = 0
+      pindex = []
+      node&.children&.each do |c|
+        len = c.text.length
+        pindex += [position, len] if c['class'] == 'sp'
+        position += len
+      end
+      pindex = nil if pindex.empty?
+      [node&.text, pindex]
+    end
+  end
+end

data/lib/camdict/pronunciation.rb ADDED Viewed

@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+module Camdict
+  # pronunciation related methods shall be included in Camdict::Definition
+  module Pronunciation
+    # Get the pronunciation
+    attr_reader :pronunciation
+    private
+    # Struct Pronunciation has two members.
+    # Each +uk+/+us+ has its own mp3/ogg links.
+    Pronunciation = Struct.new(:uk, :us)
+    # Struct Link has two members +mp3+ and +ogg+, which are the http links.
+    Link = Struct.new(:mp3, :ogg)
+    # Get the UK/US pronunciation mp3/ogg links as Struct uk:Link, us:Link
+    def get_pronunciation(html)
+      @pronunciation ||= parse_pron(html)
+    end
+    def parse_pron(html)
+      case where(html)
+      when 'title'
+        ukpron = pronunciation_node(html, 'UK')
+        uspron = pronunciation_node(html, 'US')
+      when 'derived'
+        ukpron = pronunciation_derived(html, 'UK')
+        uspron = pronunciation_derived(html, 'US')
+      end
+      Pronunciation.new(link(ukpron), link(uspron))
+    end
+    def pronunciation_node(html, region)
+      html.css(pronunciation_selector(region))
+    end
+    def pronunciation_derived(html, region)
+      derived_css(html, pronunciation_selector(region)) { |node| return node }
+    end
+    def pronunciation_selector(region)
+      %([pron-region="#{region}"] .sound)
+    end
+    # parameter +pron+ is a Nokogiri::Node
+    def link(pron)
+      return Link.new if pron.empty?
+      mp3_link = pron.attr('data-src-mp3').text
+      ogg_link = pron.attr('data-src-ogg').text
+      Link.new mp3_link, ogg_link
+    end
+  end
+end

data/lib/camdict/sentence.rb ADDED Viewed

@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+require 'camdict/common'
+module Camdict
+  # Parse the html to get the example sentence and its typical usage
+  # information associated with this sentence.
+  class Sentence
+    # Get the grammar code or usage in this sentence.
+    # It means how the word is used in this sentence.
+    # For example, a grammar code for the word -
+    # 'somewhere' is "+to infinitive". I'm looking for somewhere to eat.
+    attr_reader :usage
+    # Get one sentence inside an example block.
+    attr_reader :sentence
+    # Create a sentence object from +html+ containing the eg block.
+    def initialize(html)
+      @usage = get_usage(html)
+      @sentence = get_sentence(html)
+    end
+    private
+    # Parse html node under block gcs or usage to get its grammar code or
+    # usage info for this sentence.
+    def get_usage(html)
+      css_text(html, '.gcs') || css_text(html, '.usage')
+    end
+    # Get sentence inside example block(.eg).
+    def get_sentence(html)
+      css_text(html, '.eg')
+    end
+    include Camdict::Common
+  end
+end

data/lib/camdict/string_ext.rb ADDED Viewed

@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+module Camdict
+  # Extension: Refine String class.
+  module StringExt
+    refine String do
+      # Test whether a String includes the +word+. It's useful while testing
+      # a variable which might be an array of phrase or just a single phrase.
+      alias_method :has?, :include?
+      # 'blow a kiss to/at sb'.flatten =>
+      #   ['blow a kiss to sb', 'blow a kiss at sb']
+      # if it doesn't include a slash, returns stripped string
+      def flatten
+        # strip & remove the space surrounding '/'
+        str = strip.gsub(%r{\s*\/\s*}, '/')
+        return str unless str.include? '/'
+        return f_semicolon(str) if str.include?(';')
+        return f_parenthese(str) if str.include? '('
+        f_convert(str)
+      end
+      private
+      # when two strings are passed in separated with ';', then separate them
+      def f_semicolon(str)
+        # Workaround: before Ruby 2.4 the &:flatten form presumably does not
+        # see this refinement, so use an explicit block (or upgrade to 2.4):
+        # str.split(';').map(&:flatten).flatten
+        str.split(';').map { |s| s.flatten }.flatten
+      end
+      # when a string has round brackets meaning optional part
+      def f_parenthese(str)
+        head, bracket, tail = str.partition(/\(.*\)/)
+        return if bracket.empty?
+        ret = []
+        ret << (head.strip + tail).flatten
+        ret += f_str_in_bracket(bracket).map { |s| (head + s + tail).flatten }
+        ret.flatten
+      end
+      def f_str_in_bracket(bracket)
+        result = bracket.delete('()').flatten
+        result.is_a?(String) ? [result] : result
+      end
+      def f_convert(str)
+        b, e, j = f_alernative_index(str)
+        return unless j.positive?
+        f_combine(str, b, e, j)
+      end
+      def f_combine(str, b, e, j)
+        (0..j).map do |i|
+          if f_alter_not_start_end?(str, b, e, j)
+            f_word_not_start_end(str, b, e, i, j)
+          elsif f_alter_at_end?(str, b, e, j)
+            f_word_at_end(str, b, e, i)
+          elsif f_alter_at_start?(str, b, e, j)
+            f_word_at_start(str, b, e, i, j)
+          else str[b[i]..e[i]]
+          end
+        end
+      end
+      # alternative word is not the last word and not at the beginning
+      def f_alter_not_start_end?(str, b, e, j)
+        e[j] + 1 < str.length && b[0].positive?
+      end
+      def f_alter_at_end?(str, b, e, j)
+        e[j] + 1 == str.length && b[0].positive?
+      end
+      def f_alter_at_start?(str, b, e, j)
+        e[j] + 1 < str.length && b[0].zero?
+      end
+      def f_word_not_start_end(str, b, e, i, j)
+        f_word_at_end(str, b, e, i) + str[e[j] + 1..str.length - 1]
+      end
+      def f_word_at_end(str, b, e, i)
+        str[0..b[0] - 1] + str[b[i]..e[i]]
+      end
+      def f_word_at_start(str, b, e, i, j)
+        str[b[i]..e[i]] + str[e[j] + 1..str.length - 1]
+      end
+      def f_alernative_index(str)
+        h = f_init
+        f_alternative_loop(str, h)
+        [h[:b], h[:e], h[:j]]
+      end
+      def f_alternative_loop(str, h)
+        while h[:i] < str.length && !h[:quit]
+          case str[h[:i]]
+          # valid char in a word
+          when /[[:alnum:]\-']/ then f_update_start_end(h)
+          # char means a word has ended
+          when ' ', '!', '?', ',', '.' then f_reset_or_quit(h)
+          # 'or' separator
+          when '/' then f_include_next(h)
+          else f_raise_not_implement_error(str, h)
+          end
+          h[:i] += 1
+        end
+      end
+      def f_init
+        i = j = 0 # count of the alternative words, 'to/at' has two.
+        b = [] # b[]/e[] index of the beginning/end of alternative words
+        e = []
+        # set this flag when the next word is expected to be an alternative
+        # word following a slash
+        include_next = quit = false
+        { i: i, j: j, b: b, e: e, include_next: include_next, quit: quit }
+      end
+      def f_include_next(h)
+        h[:j] += 1
+        h[:include_next] = true
+      end
+      def f_raise_not_implement_error(str, h)
+        raise NotImplementedError, "char '#{str[h[:i]]}' found in '#{self}'."
+      end
+      def f_update_start_end(h)
+        h[:b][h[:j]] = h[:i] if h[:b][h[:j]].nil?
+        h[:e][h[:j]] = h[:i]
+      end
+      def f_reset_or_quit(h)
+        return h[:quit] = true if h[:include_next]
+        h[:b][h[:j]] = nil
+        h[:e][h[:j]] = nil
+      end
+    end
+  end
+end