oald_parser 0.1.8 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,9 +14,9 @@ module OaldParser
14
14
  end
15
15
 
16
16
  def self.create_facade
17
- downloader = PageDownloader.new('http://www.oup.com/oald-bin/web_getald7index1a.pl')
17
+ downloader = PageDownloader.new('http://www.oxfordadvancedlearnersdictionary.com/dictionary')
18
18
  parser = PageParser.new
19
- formatter = Formatter.new(lines: 15)
19
+ formatter = Formatter.new(items: 5)
20
20
  extractor = WordExtractor.new
21
21
  Facade.new(downloader, parser, formatter, extractor)
22
22
  end
@@ -6,18 +6,41 @@ module OaldParser
6
6
  @options = options
7
7
  end
8
8
 
9
- def format(content)
10
- lined_content = content.gsub(/<\s*br\s*\/*>/, "\n")
11
- text = Nokogiri::HTML(lined_content).text
12
- first_lines = first_lines(text, @options[:lines])
13
- first_lines.strip
14
- rescue
15
- nil
9
+ def format(page)
10
+ if !page.blocks.empty?
11
+ format_blocks(page.blocks)
12
+ else
13
+ format_items(page.items)
14
+ end
16
15
  end
17
16
 
18
17
  private
19
- def first_lines(text, lines)
20
- text.split("\n").first(lines).join("\n")
18
+ def format_blocks(blocks, limit = 1000)
19
+ blocks.collect do |block|
20
+ res = ''
21
+ res += block.text.upcase
22
+ res += "\n"
23
+ res += '-' * 20
24
+ res += "\n"
25
+ res += format_items(block.items)
26
+ res
27
+ end.join("\n\n")
28
+ end
29
+
30
+ def format_items(items, limit = 1000)
31
+ items.collect do |item|
32
+ res = ''
33
+ res += item.text
34
+ if !item.examples.empty?
35
+ res += "\n"
36
+ res += format_examples(item.examples)
37
+ end
38
+ res
39
+ end.join("\n\n")
40
+ end
41
+
42
+ def format_examples(examples)
43
+ examples.collect {|e| "+ #{e}"}.join("\n")
21
44
  end
22
45
  end
23
46
  end
@@ -8,10 +8,9 @@ module OaldParser
8
8
  end
9
9
 
10
10
  def download(word)
11
- url = URI.parse(@url)
12
- Net::HTTP.post_form(url, "search_word=#{word}")
11
+ url = URI.parse("#{@url}/#{word}")
12
+ Net::HTTP.get(url)
13
13
  rescue Exception => e
14
- puts e.inspect
15
14
  nil
16
15
  end
17
16
  end
@@ -1,22 +1,66 @@
1
+ require 'nokogiri'
2
+
1
3
  module OaldParser
4
+ class Page
5
+ attr_reader :blocks, :items
6
+ def initialize(blocks, items)
7
+ @blocks, @items = blocks, items
8
+ end
9
+ end
10
+
11
+ class Block
12
+ attr_reader :text, :items
13
+ def initialize(text, items)
14
+ @text, @items = text, items
15
+ end
16
+ end
17
+
18
+ class Item
19
+ attr_reader :text, :examples
20
+ def initialize(text, examples)
21
+ @text, @examples = text, examples
22
+ end
23
+ end
24
+
2
25
  class PageParser
3
26
  def parse(page)
4
- page = extract_part_without_header(page)
5
- return nil unless page
6
- page = extract_part_without_footer(page)
7
- return nil unless page
8
- page.strip
27
+ parsed = Nokogiri::HTML(page)
28
+ if blocks_on_page?(parsed)
29
+ Page.new(parse_block(parsed), [])
30
+ else
31
+ Page.new([], parse_items(parsed))
32
+ end
9
33
  end
10
34
 
11
35
  private
12
- def extract_part_without_header(page)
13
- parts = page.split(/<\/select>\s*<\/form>/i)
14
- parts.size == 2 ? parts[1] : nil
36
+ def blocks_on_page?(page)
37
+ page.css('div.sd-g').first
38
+ end
39
+
40
+ def parse_block(page)
41
+ block_nodes = page.css('div.sd-g')
42
+ block_nodes.collect do |block|
43
+ block_text = all_except(block, 'n-g')
44
+ items = parse_items(block)
45
+ Block.new(block_text, items)
46
+ end
47
+ end
48
+
49
+ def parse_items(block)
50
+ item_nodes = block.css('span.n-g')
51
+ item_nodes.collect do |item|
52
+ item_text = all_except(item, 'x-g')
53
+ example_nodes = item.css('span.x-g')
54
+ examples = example_nodes.collect{|e| e.text.strip}
55
+ Item.new(item_text, examples)
56
+ end
15
57
  end
16
58
 
17
- def extract_part_without_footer(page)
18
- parts = page.split(/<div\s+class='oald'>/i)
19
- parts.size == 2 ? parts[0] : nil
59
+ def all_except(item, class_name)
60
+ elements = item.children.find_all do |c|
61
+ !(c.name == 'span' && c[:class] == class_name)
62
+ end
63
+ elements.collect{|e|e.text}.join('').strip
20
64
  end
21
65
  end
22
66
  end
data/lib/oald_parser.rb CHANGED
@@ -2,4 +2,19 @@ require_relative 'oald_parser/facade'
2
2
  require_relative 'oald_parser/formatter'
3
3
  require_relative 'oald_parser/oald_parser_exception'
4
4
  require_relative 'oald_parser/page_downloader'
5
- require_relative 'oald_parser/page_parser'
5
+ require_relative 'oald_parser/page_parser'
6
+
7
+
8
+ #include OaldParser
9
+ #
10
+ #downloader = PageDownloader.new("http://www.oxfordadvancedlearnersdictionary.com/dictionary")
11
+ #page = downloader.download("surface")
12
+ #parser = PageParser.new
13
+ #parsed = parser.parse(page)
14
+ #formatter = Formatter.new(items: 15)
15
+ #puts formatter.format(parsed)
16
+
17
+ #class=sd-g block
18
+ #class=n-g new line
19
+ #class=x-g new list item
20
+ #class=xr-g delete
metadata CHANGED
@@ -4,9 +4,8 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 1
8
- - 8
9
- version: 0.1.8
7
+ - 2
8
+ version: "0.2"
10
9
  platform: ruby
11
10
  authors:
12
11
  - Victor Savkin