oald_parser 0.1.8 → 0.2

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -14,9 +14,9 @@ module OaldParser
14
14
  end
15
15
 
16
16
  def self.create_facade
17
- downloader = PageDownloader.new('http://www.oup.com/oald-bin/web_getald7index1a.pl')
17
+ downloader = PageDownloader.new('http://www.oxfordadvancedlearnersdictionary.com/dictionary')
18
18
  parser = PageParser.new
19
- formatter = Formatter.new(lines: 15)
19
+ formatter = Formatter.new(items: 5)
20
20
  extractor = WordExtractor.new
21
21
  Facade.new(downloader, parser, formatter, extractor)
22
22
  end
@@ -6,18 +6,41 @@ module OaldParser
6
6
  @options = options
7
7
  end
8
8
 
9
- def format(content)
10
- lined_content = content.gsub(/<\s*br\s*\/*>/, "\n")
11
- text = Nokogiri::HTML(lined_content).text
12
- first_lines = first_lines(text, @options[:lines])
13
- first_lines.strip
14
- rescue
15
- nil
9
+ def format(page)
10
+ if !page.blocks.empty?
11
+ format_blocks(page.blocks)
12
+ else
13
+ format_items(page.items)
14
+ end
16
15
  end
17
16
 
18
17
  private
19
- def first_lines(text, lines)
20
- text.split("\n").first(lines).join("\n")
18
+ def format_blocks(blocks, limit = 1000)
19
+ blocks.collect do |block|
20
+ res = ''
21
+ res += block.text.upcase
22
+ res += "\n"
23
+ res += '-' * 20
24
+ res += "\n"
25
+ res += format_items(block.items)
26
+ res
27
+ end.join("\n\n")
28
+ end
29
+
30
+ def format_items(items, limit = 1000)
31
+ items.collect do |item|
32
+ res = ''
33
+ res += item.text
34
+ if !item.examples.empty?
35
+ res += "\n"
36
+ res += format_examples(item.examples)
37
+ end
38
+ res
39
+ end.join("\n\n")
40
+ end
41
+
42
+ def format_examples(examples)
43
+ examples.collect {|e| "+ #{e}"}.join("\n")
21
44
  end
22
45
  end
23
46
  end
@@ -8,10 +8,9 @@ module OaldParser
8
8
  end
9
9
 
10
10
  def download(word)
11
- url = URI.parse(@url)
12
- Net::HTTP.post_form(url, "search_word=#{word}")
11
+ url = URI.parse("#{@url}/#{word}")
12
+ Net::HTTP.get(url)
13
13
  rescue Exception => e
14
- puts e.inspect
15
14
  nil
16
15
  end
17
16
  end
@@ -1,22 +1,66 @@
1
+ require 'nokogiri'
2
+
1
3
  module OaldParser
4
+ class Page
5
+ attr_reader :blocks, :items
6
+ def initialize(blocks, items)
7
+ @blocks, @items = blocks, items
8
+ end
9
+ end
10
+
11
+ class Block
12
+ attr_reader :text, :items
13
+ def initialize(text, items)
14
+ @text, @items = text, items
15
+ end
16
+ end
17
+
18
+ class Item
19
+ attr_reader :text, :examples
20
+ def initialize(text, examples)
21
+ @text, @examples = text, examples
22
+ end
23
+ end
24
+
2
25
  class PageParser
3
26
  def parse(page)
4
- page = extract_part_without_header(page)
5
- return nil unless page
6
- page = extract_part_without_footer(page)
7
- return nil unless page
8
- page.strip
27
+ parsed = Nokogiri::HTML(page)
28
+ if blocks_on_page?(parsed)
29
+ Page.new(parse_block(parsed), [])
30
+ else
31
+ Page.new([], parse_items(parsed))
32
+ end
9
33
  end
10
34
 
11
35
  private
12
- def extract_part_without_header(page)
13
- parts = page.split(/<\/select>\s*<\/form>/i)
14
- parts.size == 2 ? parts[1] : nil
36
+ def blocks_on_page?(page)
37
+ page.css('div.sd-g').first
38
+ end
39
+
40
+ def parse_block(page)
41
+ block_nodes = page.css('div.sd-g')
42
+ block_nodes.collect do |block|
43
+ block_text = all_except(block, 'n-g')
44
+ items = parse_items(block)
45
+ Block.new(block_text, items)
46
+ end
47
+ end
48
+
49
+ def parse_items(block)
50
+ item_nodes = block.css('span.n-g')
51
+ item_nodes.collect do |item|
52
+ item_text = all_except(item, 'x-g')
53
+ example_nodes = item.css('span.x-g')
54
+ examples = example_nodes.collect{|e| e.text.strip}
55
+ Item.new(item_text, examples)
56
+ end
15
57
  end
16
58
 
17
- def extract_part_without_footer(page)
18
- parts = page.split(/<div\s+class='oald'>/i)
19
- parts.size == 2 ? parts[0] : nil
59
+ def all_except(item, class_name)
60
+ elements = item.children.find_all do |c|
61
+ !(c.name == 'span' && c[:class] == class_name)
62
+ end
63
+ elements.collect{|e|e.text}.join('').strip
20
64
  end
21
65
  end
22
66
  end
data/lib/oald_parser.rb CHANGED
@@ -2,4 +2,19 @@ require_relative 'oald_parser/facade'
2
2
  require_relative 'oald_parser/formatter'
3
3
  require_relative 'oald_parser/oald_parser_exception'
4
4
  require_relative 'oald_parser/page_downloader'
5
- require_relative 'oald_parser/page_parser'
5
+ require_relative 'oald_parser/page_parser'
6
+
7
+
8
+ #include OaldParser
9
+ #
10
+ #downloader = PageDownloader.new("http://www.oxfordadvancedlearnersdictionary.com/dictionary")
11
+ #page = downloader.download("surface")
12
+ #parser = PageParser.new
13
+ #parsed = parser.parse(page)
14
+ #formatter = Formatter.new(items: 15)
15
+ #puts formatter.format(parsed)
16
+
17
+ #class=sd-g block
18
+ #class=n-g new line
19
+ #class=x-g new list item
20
+ #class=xr-g delete
metadata CHANGED
@@ -4,9 +4,8 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 1
8
- - 8
9
- version: 0.1.8
7
+ - 2
8
+ version: "0.2"
10
9
  platform: ruby
11
10
  authors:
12
11
  - Victor Savkin