makasi 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8fc25b3e834a8133c582ec340b7fc6cec2e3f7d0
4
+ data.tar.gz: f8ef50afeed87dd090c4aff16d1bdb921b2b6820
5
+ SHA512:
6
+ metadata.gz: 74a6f9d2130d6a8f65e770400b91ae50a3baaf296b3b345e0a82e6f21743b3a3a97b82990331b983df76e9817ae031d668d10e8a3bb6afc2b0a9d48ddc145364
7
+ data.tar.gz: 7a641617aba0095d12b0149cf7d34e8a178a6290060305d56714d827681395cebcd7933fa0043a4199d62747c40589aec99edb77d7718c058047014be098684f
Binary file
@@ -0,0 +1 @@
1
+ /pkg
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'asari', github: "nkumeiko/asari"
4
+
5
+ # Specify your gem's dependencies in makasi.gemspec
6
+ gemspec
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 nkumeiko
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,39 @@
1
+ # Makasi
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/makasi`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'makasi'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install makasi
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ 1. Fork it ( https://github.com/[my-github-username]/makasi/fork )
36
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
37
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
38
+ 4. Push to the branch (`git push origin my-new-feature`)
39
+ 5. Create a new Pull Request
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,20 @@
1
# Mongoid document that tracks one sitemap URL mirrored into CloudSearch.
#
# Destroying a record also asks Makasi::AsariClient to remove the item
# stored for the same URL (see the before_destroy callback below).
class CloudSearchDocument
  include Mongoid::Document

  field :url,                type: String
  field :present_in_sitemap, type: Boolean
  # Records start with a fixed date in the past as their last reindex time.
  field :reindexed_at,       type: DateTime, default: DateTime.new(2000, 1, 1)

  index({ url: 1 }, unique: true)
  index({ reindexed_at: -1 }, background: true)

  validates_uniqueness_of :url

  before_destroy :remove_cloudsearch_index

  private

  # Removes the search item that corresponds to this record's URL.
  def remove_cloudsearch_index
    search_client = Makasi::AsariClient.new
    search_client.remove_item(url)
  end
end
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "makasi"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
Binary file
@@ -0,0 +1,8 @@
1
+ require "makasi/asari_client"
2
+ require "makasi/asari_result"
3
+ require "makasi/config"
4
+ require "makasi/search_index"
5
+ require "makasi/version"
6
+
7
+ module Makasi
8
+ end
Binary file
@@ -0,0 +1,45 @@
1
require "digest"

module Makasi
  # Wrapper around the Asari CloudSearch client.
  #
  # Documents are addressed by caller-supplied ids (URLs in this gem); the id
  # sent to Asari is the MD5 hex digest of that value.
  class AsariClient
    # Adds or replaces a document.
    #
    # @param id [String] caller-side identifier, digested before sending
    # @param fields [Hash] document fields passed to Asari unchanged
    def add_item(id, fields)
      asari.add_item(document_id(id), fields)
    end

    # Removes the document stored for +id+.
    #
    # @param id [String] caller-side identifier, digested before sending
    def remove_item(id)
      asari.remove_item(document_id(id))
    end

    # Runs a search and returns whatever Asari returns.
    #
    # @param query [String]
    # @param params [Hash] extra options forwarded to Asari#search
    def search(query, params = {})
      asari.search(query, params)
    end

    # Searches within one resource type and returns the "resource_id" value
    # of every hit.
    #
    # @param query [String]
    # @param resource_type [String]
    # @return [Array]
    def search_resource_ids(query, resource_type)
      results = search(query, filter: { and: { resource_type: resource_type } })
      results.map { |_id, item| item["resource_id"] }
    end

    # Repeatedly searches with a "term OR NOT term" query and removes every
    # returned item until a search comes back empty.
    def remove_all
      loop do
        items = search("lolzcat|-lolzcat")
        break if items.empty?

        items.each do |id, _item|
          # Ids coming back from a search are already digested; bypass document_id.
          asari.remove_item(id)
          Rails.logger.debug "Makasi::AsariClient: item ##{id} has been removed"
        end
      end
    end

    private

    # MD5 hex digest used as the document id.
    #
    # Deliberately NOT named +hash+: a private one-argument +hash+ would
    # shadow Object#hash and break instances used as Hash keys.
    def document_id(str)
      Digest::MD5.hexdigest(str)
    end

    # Memoized Asari client configured from Makasi::Config. The global mode is
    # re-asserted on every access, as the unmemoized version used to do.
    def asari
      Asari.mode = :production
      @asari ||= begin
        client = Asari.new(Makasi::Config.cloudsearch_index)
        client.api_version = Makasi::Config.cloudsearch_api_version
        client.aws_region = Makasi::Config.cloudsearch_aws_region
        client
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
require "erb"

module Makasi
  # Presents one search hit: URL, title and a content snippet, each with the
  # query words wrapped in <span class='highlighted'>.
  #
  # All returned strings are HTML-escaped before the highlight markup is
  # added, and only then marked html_safe.
  class AsariResult
    include ActionView::Helpers

    # Maximum snippet length passed to +truncate+.
    SNIPPET_LENGTH = 130
    # Number of characters kept in front of the first matched query word.
    SNIPPET_LEAD = 70

    attr_reader :url

    # @param asari_result [Hash] reads the "url", "resource_name" and "content" keys
    # @param query [String] the user's search query (whitespace separated words)
    def initialize(asari_result, query)
      @url = asari_result["url"]
      @asari_result = asari_result
      @query = query
    end

    # @return [String] escaped URL with query words highlighted
    def highlighted_url
      highlight_query url.to_s
    end

    # @return [String] escaped resource name with query words highlighted
    def title
      highlight_query @asari_result["resource_name"].to_s
    end

    # @return [String] escaped, truncated extract of the content around the
    #   first query word found, with query words highlighted
    def snippet
      text = HTMLEntities.new.decode(strip_tags(@asari_result["content"].to_s).gsub(/\s+/, " "))
      # escape: false — escaping happens exactly once, inside highlight_query.
      highlight_query truncate(snippet_containing_query(text), length: SNIPPET_LENGTH, escape: false)
    end

    private

    # Words of the query; empty for a nil or blank query.
    def query_words
      @query.to_s.split
    end

    # Case-insensitive regexp matching any query word, or nil when there are none
    # (an empty alternation would match at every position).
    def query_pattern
      words = query_words
      return nil if words.empty?

      Regexp.new(words.map { |word| Regexp.escape(word) }.join("|"), Regexp::IGNORECASE)
    end

    # HTML-escapes a plain fragment.
    def escape_fragment(fragment)
      ERB::Util.html_escape(fragment).to_s
    end

    # Escapes +text+ and wraps every query word occurrence in a highlight span.
    # Named to avoid shadowing ActionView's own +highlight+ helper.
    def highlight_query(text)
      # Copy into a plain String so an html_safe flag on the input cannot
      # make the escaping below a no-op.
      plain = String.new(text.to_s)
      pattern = query_pattern
      return escape_fragment(plain).html_safe unless pattern

      markup = String.new
      cursor = 0
      plain.scan(pattern) do
        match = Regexp.last_match
        markup << escape_fragment(plain[cursor...match.begin(0)])
        markup << "<span class='highlighted'>" << escape_fragment(match[0]) << "</span>"
        cursor = match.end(0)
      end
      markup << escape_fragment(plain[cursor..-1])
      markup.html_safe
    end

    # Extracts the part of +text+ that contains the query, or at least the
    # first query word (in query order) that occurs in it. Matching is
    # case-insensitive, consistent with the highlighting.
    def snippet_containing_query(text)
      position = query_words
                 .map { |word| text.index(Regexp.new(Regexp.escape(word), Regexp::IGNORECASE)) }
                 .compact
                 .first
      return text unless position && position > SNIPPET_LEAD

      tail = text[(position - SNIPPET_LEAD)..-1]
      word_break = tail.index(" ")
      # Drop the partial word at the cut, but only when the break lies before
      # the match; never cut into or past the matched word.
      tail = tail[(word_break + 1)..-1] if word_break && word_break < SNIPPET_LEAD
      "..." + tail
    end
  end
end
@@ -0,0 +1,31 @@
1
module Makasi
  # Accessors for the gem's settings, which live under
  # Rails.configuration.x.makasi.
  class Config
    class << self
      # The underlying Rails configuration object for this gem.
      def rails_config
        Rails.configuration.x.makasi
      end

      # Yields the configuration object so the caller can assign settings.
      def setup
        yield rails_config
      end

      def cloudsearch_index
        rails_config.cloudsearch_index
      end

      def sitemap_url
        rails_config.sitemap_url
      end

      def website_url
        rails_config.website_url
      end

      # Configured API version, or "2013-01-01" when blank.
      def cloudsearch_api_version
        configured = rails_config.cloudsearch_api_version.presence
        configured || "2013-01-01"
      end

      # Configured AWS region, or "us-east-1" when blank.
      def cloudsearch_aws_region
        configured = rails_config.cloudsearch_aws_region.presence
        configured || "us-east-1"
      end
    end
  end
end
@@ -0,0 +1,145 @@
1
require "net/http"
require "open-uri"
require "stringio"
require "uri"
require "zlib"

module Makasi
  # Synchronises CloudSearchDocument records with the sitemap, fetches every
  # page and pushes its extracted fields to CloudSearch.
  class SearchIndex
    MAX_LITERAL_SIZE = 4095
    MAX_TEXT_SIZE = 262144

    # Syncs the local records with the sitemap, then fetches and indexes every
    # page, stamping each record with the time it was reindexed.
    def reindex
      sync_db_with_sitemap

      # NOTE(review): desc order processes the most recently reindexed pages
      # first; confirm that oldest-first was not intended.
      CloudSearchDocument.desc(:reindexed_at).each do |cloudsearch_doc|
        html_doc = Nokogiri::HTML(load_page(cloudsearch_doc.url))

        if Rails.logger.debug?
          Rails.logger.debug ">>> URL: " + cloudsearch_doc.url +
            "\n\tTITLE: " + title_of(html_doc) +
            "\n\tCONTENT: " + content_of(html_doc)[0..300] +
            "\n\tAUTHOR: " + meta_tag_for(html_doc, "author") +
            "\n\tCONTENT_LANGUAGE: " + language_of(html_doc) +
            "\n\tDESCRIPTION: " + meta_tag_for(html_doc, "description")[0..300] +
            "\n\tKEYWORDS: " + meta_tag_for(html_doc, "keywords") +
            "\n\tRESOURCE_TYPE: " + meta_tag_for(html_doc, "resource_type") +
            "\n\tRESOURCE_NAME: " + resource_name_of(html_doc) +
            "\n\tRESOURCE_ID: " + meta_tag_for(html_doc, "resource_id") +
            "\n"
        end

        add_item_to_cloudsearch(cloudsearch_doc, html_doc)

        cloudsearch_doc.update_attributes(reindexed_at: DateTime.now)
      end
    end

    # Sends the fields extracted from +html_doc+ to CloudSearch under the
    # document's URL. Every string field is cut to at most the configured
    # maximum number of characters.
    #
    # @param cloudsearch_doc [#url] record whose URL identifies the item
    # @param html_doc parsed page the fields are extracted from
    def add_item_to_cloudsearch(cloudsearch_doc, html_doc)
      asari.add_item(cloudsearch_doc.url, {
        url: cloudsearch_doc.url,
        title: clip(title_of(html_doc), MAX_TEXT_SIZE),
        content: clip(content_of(html_doc), MAX_TEXT_SIZE),
        author: clip(meta_tag_for(html_doc, "author"), MAX_TEXT_SIZE),
        content_language: clip(language_of(html_doc), MAX_LITERAL_SIZE),
        description: clip(meta_tag_for(html_doc, "description"), MAX_TEXT_SIZE),
        keywords: meta_tag_for(html_doc, "keywords").split(",").map(&:strip),
        resource_type: clip(meta_tag_for(html_doc, "resource_type"), MAX_TEXT_SIZE),
        resource_name: clip(resource_name_of(html_doc), MAX_TEXT_SIZE),
        resource_id: clip(meta_tag_for(html_doc, "resource_id"), MAX_TEXT_SIZE)
      })
    end

    # Marks every record as absent, re-marks those whose URL is listed in the
    # sitemap (creating missing ones), then destroys the records still absent.
    def sync_db_with_sitemap
      CloudSearchDocument.update_all(present_in_sitemap: false)
      url_nodes = Nokogiri::XML(read_sitemap).css('url loc')

      url_nodes.each do |url_node|
        # Strip so pretty-printed <loc> elements do not yield URLs with whitespace.
        cloudsearch_doc = CloudSearchDocument.find_or_initialize_by(url: url_node.text.strip)
        cloudsearch_doc.update_attributes(present_in_sitemap: true)
      end

      if Rails.logger.debug?
        Rails.logger.debug "SEARCH_INDEX: Updated #{CloudSearchDocument.where(present_in_sitemap: true).count} documents"
        Rails.logger.debug "SEARCH_INDEX: Removed #{CloudSearchDocument.where(present_in_sitemap: false).count} documents"
      end

      CloudSearchDocument.where(present_in_sitemap: false).destroy_all
    end

    # Fetches a page body, following up to +limit+ redirects.
    #
    # @param url [String] page address; never modified by this method
    # @param limit [Integer] remaining redirects to follow
    # @return [String] response body, or "" when the page could not be loaded
    def load_page(url, limit = 10)
      if limit == 0
        Rails.logger.error "ERROR: Faild load sitemap's url #{url}"
        return ""
      end

      ## Patch for indexing from localhost
      if Rails.env.development?
        url += "/" unless url.ends_with?("/")
        # Non-destructive gsub: the caller's string may be a model attribute.
        url = url.gsub(Makasi::Config.website_url, "localhost:3000")
      end

      parsed_url = URI.parse(url)
      request = Net::HTTP::Get.new(url)
      # Enable TLS for https URLs; Net::HTTP.start speaks plain HTTP otherwise.
      use_ssl = parsed_url.scheme == "https"
      response = Net::HTTP.start(parsed_url.host, parsed_url.port, use_ssl: use_ssl) { |http| http.request(request) }
      case response
      when Net::HTTPSuccess then response.body
      when Net::HTTPRedirection
        # Location may be relative; resolve it against the current URL.
        load_page(URI.join(url, response['location']).to_s, limit - 1)
      else
        Rails.logger.error "Makasi::SearchIndex ERROR: Faild load sitemap's url #{url}"
        return ""
      end
    end

    # Memoized CloudSearch client.
    def asari
      @asari ||= Makasi::AsariClient.new
    end

    # Reads and gunzips the sitemap named by Makasi::Config.sitemap_url.
    # Remote locations are read through open-uri, anything else as a local
    # file; the handle is closed in both cases.
    #
    # @return [String] the decompressed sitemap
    def read_sitemap
      source = Makasi::Config.sitemap_url
      uri = URI.parse(source)
      # Only URI classes extended by open-uri (http/https/ftp) expose a public #open.
      if uri.respond_to?(:open)
        uri.open { |io| gunzip(io) }
      else
        File.open(source, "rb") { |io| gunzip(io) }
      end
    end

    # Entity-decoded, stripped content of the first <meta name="..."> tag, or "".
    def meta_tag_for(doc, name)
      nodes = doc.css("meta[name='#{name}']")
      nodes.present? ? HTMLEntities.new.decode(nodes[0]["content"].to_s.strip) : ""
    end

    # Entity-decoded text of the first <title>, or "".
    def title_of(doc)
      nodes = doc.xpath("//title")
      nodes.present? ? HTMLEntities.new.decode(nodes[0].text) : ""
    end

    # Text of all [data-indexable] elements, or of the whole document when
    # none are marked.
    def content_of(doc)
      content_nodes = doc.css("[data-indexable]")
      if content_nodes.present?
        extract_text(content_nodes)
      else
        extract_text([doc])
      end
    end

    # Value of the lang attribute on <html>, or "".
    def language_of(doc)
      nodes = doc.xpath("//html")
      nodes.present? ? nodes[0]["lang"].to_s : ""
    end

    # Collects text nodes and image alt texts below +nodes+, separated by
    # single spaces, with HTML entities decoded.
    def extract_text(nodes)
      content = StringIO.new
      nodes.each do |node|
        node.traverse do |child_node|
          if child_node.text?
            content << child_node.text
          elsif child_node.name == "img"
            content << child_node["alt"].to_s
          end
          content << " "
        end
      end
      HTMLEntities.new.decode(content.string.gsub(/\s+/, " ").strip)
    end

    # Joined text of all [data-title] elements, falling back to the page title.
    def resource_name_of(doc)
      content_nodes = doc.css("[data-title]")
      if content_nodes.present?
        HTMLEntities.new.decode(content_nodes.map(&:text).join(" "))
      else
        title_of(doc)
      end
    end

    private

    # Returns at most +limit+ characters of +value+ (an inclusive 0..limit
    # range would keep limit + 1).
    def clip(value, limit)
      value[0, limit]
    end

    # Decompresses a gzip stream read from +io+.
    def gunzip(io)
      Zlib::GzipReader.new(io).read
    end
  end
end
@@ -0,0 +1,3 @@
1
module Makasi
  # Gem version; frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,10 @@
1
namespace :makasi do
  # Descriptions make the tasks show up in `rake -T`.
  desc "Refresh the sitemap, sync tracked documents with it and reindex every page"
  task :search_reindex => [:environment, "sitemap:refresh"] do
    Makasi::SearchIndex.new.reindex
  end

  desc "Remove every item from the search index and delete all tracked documents"
  task :search_truncate_index => :environment do
    Makasi::AsariClient.new.remove_all
    # delete_all rather than destroy_all: items were already removed above.
    CloudSearchDocument.delete_all
  end
end
@@ -0,0 +1,23 @@
1
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'makasi/version'

Gem::Specification.new do |spec|
  spec.name          = "makasi"
  spec.version       = Makasi::VERSION
  spec.authors       = ["Nataliia Kumeiko"]
  spec.email         = ["nkumeiko@gmail.com"]

  spec.summary       = "An easy way to index sitemap and search through it. Based on Amazon CloudSearch."
  spec.homepage      = "http://slatestudio.com"
  spec.license       = "MIT"

  # Package tracked files, leaving out test directories and macOS Finder
  # metadata (.DS_Store) that was committed by accident.
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/}) || File.basename(f) == ".DS_Store"
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # NOTE(review): the library code references Asari, Nokogiri and
  # HTMLEntities, but no runtime dependencies are declared here — confirm
  # whether the host application is expected to provide them.
  spec.add_development_dependency "bundler", "~> 1.8"
  spec.add_development_dependency "rake", "~> 10.0"
end
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: makasi
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Nataliia Kumeiko
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2015-07-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description:
42
+ email:
43
+ - nkumeiko@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".DS_Store"
49
+ - ".gitignore"
50
+ - Gemfile
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - app/models/cloud_search_document.rb
55
+ - bin/console
56
+ - bin/setup
57
+ - lib/.DS_Store
58
+ - lib/makasi.rb
59
+ - lib/makasi/.DS_Store
60
+ - lib/makasi/asari_client.rb
61
+ - lib/makasi/asari_result.rb
62
+ - lib/makasi/config.rb
63
+ - lib/makasi/search_index.rb
64
+ - lib/makasi/version.rb
65
+ - lib/tasks/search_reindex.rake
66
+ - makasi.gemspec
67
+ homepage: http://slatestudio.com
68
+ licenses:
69
+ - MIT
70
+ metadata: {}
71
+ post_install_message:
72
+ rdoc_options: []
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: '0'
80
+ required_rubygems_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ requirements: []
86
+ rubyforge_project:
87
+ rubygems_version: 2.4.6
88
+ signing_key:
89
+ specification_version: 4
90
+ summary: An easy way to index sitemap and search through it. Based on Amazon CloudSearch.
91
+ test_files: []