juxtaparssionate 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZTY1Y2U2NGMzZjBmNGFhM2I2ZWE5M2VmMmZjOGJiNzI3YTY1M2Q0ZQ==
5
+ data.tar.gz: !binary |-
6
+ OTg1MTFjNGI2NDEyZDM2ZTU3OWFjNGY2YzhjN2Q3YTE1ZjM4ZjI1OQ==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ ZTNkODg3MGYyZGM0OGM3OTI3MzVkNGZmZTk5MTA5OTQzMjYxYmZjMmE5NzFm
10
+ OTEyMGRkOGViMTgwN2Q0YWIxMDhiNmM5ZjUzYmI2NjM1OTFjNzg5MTAwYjQ0
11
+ M2U1ZmFlN2NlZTU4MzAyMmJiNzZmZGQzNzc3ZWEyYzhkYmQ1Y2U=
12
+ data.tar.gz: !binary |-
13
+ ZGQ1YjdlMGE5MmUyNTA5MzVlOTcwZGFiMzVmMzRkMGZiNjFkZGY3NzY3YTNh
14
+ OGI1YjVlYjk0MmMzMDFkMjY4YzZmMjAwMDRjMTkzYWUxNjU5ZWZjN2E2YzA1
15
+ YzM0Y2UxNjlmMDg3NzRkMDhhNjRkM2VjMjc3ZTA4NThhY2E5NjU=
@@ -0,0 +1,5 @@
1
+ == README
2
+
3
+ {<img src="https://travis-ci.org/trosborn/passion-parse.svg?branch=master" alt="Build Status" />}[https://travis-ci.org/trosborn/passion-parse]
4
+
5
+ juxtaparssionate parses job postings and juxtaposes the results.
@@ -0,0 +1,43 @@
1
+ require 'sanitize'
2
+ require 'treat'
3
+
4
+ include Treat::Core::DSL
5
+
6
# Reads a posting from disk and derives word-level data from it.
#
# The class relies on the `document` builder and on `apply`, which are brought
# in by the Treat DSL include at the top of the file, and on Sanitize for
# markup removal.
class Parser
  attr_accessor :contents

  # @param file_path [#to_s] location of the posting; it is stringified and
  #   passed to `document`, whose result is stored in +contents+.
  def initialize(file_path)
    @contents = document file_path.to_s
  end

  # Strips markup from the contents and normalises whitespace.
  #
  # @return [String] the output of Sanitize.fragment with every run of
  #   whitespace collapsed to a single space and both ends trimmed.
  def strip_tags
    Sanitize.fragment(@contents).split.join(' ')
  end

  # Runs the chunk/segment/tokenize/category workers over the contents and
  # collects whatever `nouns` yields.
  #
  # @return [Array<String>] string form of each noun.
  def extract_nouns
    tagged = @contents.apply(:chunk, :segment, :tokenize, :category)
    tagged.nouns.map(&:to_s)
  end

  # Runs the chunk/segment/tokenize workers over the tag-free text.
  #
  # @return [Array<String>] string form of each element produced.
  def extract_words
    strip_tags.apply(:chunk, :segment, :tokenize).map(&:to_s)
  end

  # Counts how often each element occurs.
  #
  # The result is also kept in @word_count, as before, so any code relying on
  # that instance variable keeps working.
  #
  # @param array [#each] the words to count.
  # @return [Hash] word => number of occurrences; missing words yield nil.
  def count_words(array)
    counts = {}
    array.each { |word| counts[word] = counts.fetch(word, 0) + 1 }
    @word_count = counts
  end

  # Orders the entries of +hash+ by value, smallest first.
  #
  # @param hash [Hash] typically the result of #count_words.
  # @return [Array<Array>] [key, value] pairs in ascending value order.
  def order_hash(hash)
    hash.sort_by { |_word, count| count }
  end
end
@@ -0,0 +1,23 @@
1
+ require 'mechanize'
2
+ require 'nokogiri'
3
+
4
# Fetches job postings from We Work Remotely and writes each one to a text
# file under the postings directory.
class Scraper
  # Listing page whose links are followed.
  LISTING_URL = 'https://weworkremotely.com/categories/2/jobs'.freeze
  # Link that matches the /jobs/ pattern but is skipped (presumably the
  # "post a new job" form rather than a posting).
  NEW_JOB_URL = 'https://weworkremotely.com/jobs/new'.freeze
  # Output directory, relative to the current working directory.
  OUTPUT_DIR = 'postings'.freeze

  # Visits every link on the listing page whose href contains "/jobs/",
  # concatenates the <title> and div.listing-container markup of the target
  # page and hands the result to #save.
  #
  # The index passed to #save is the link's position among the matched links,
  # so skipped links leave gaps in the file numbering.
  #
  # @return [Array] the matched links (the receiver of each_with_index).
  def we_work
    agent = Mechanize.new
    listing = agent.get(LISTING_URL)
    listing.links_with(href: %r{/jobs/}).each_with_index do |link, index|
      # NOTE(review): this only skips the link when its href is the absolute
      # URL; a relative "/jobs/new" href would not match — confirm.
      next if link.href == NEW_JOB_URL

      posting = link.click.parser
      text = posting.css('title').to_s + posting.css('div.listing-container').to_s
      save text, index
    end
  end

  # Writes +doc+ to postings/job_posting<index>.txt, replacing any existing
  # file. The directory is created first when it is missing, so a fresh
  # checkout no longer fails with Errno::ENOENT.
  #
  # @param doc [#to_s] text to store.
  # @param index [Integer, #to_s] suffix used in the file name.
  # @return [Integer] number of bytes written.
  def save(doc, index)
    Dir.mkdir(OUTPUT_DIR) unless Dir.exist?(OUTPUT_DIR)
    File.write(File.join(OUTPUT_DIR, "job_posting#{index}.txt"), doc)
  end
end
@@ -0,0 +1 @@
1
# Release number of the gem; frozen so the constant cannot be mutated in place.
VERSION = '0.0.1'.freeze
@@ -0,0 +1 @@
1
+ <div><ul><li><p> ninja <em>10x</em> wanted for ninja douchenozzle team</p></li></ul></div>
@@ -0,0 +1,32 @@
1
+ require 'minitest/autorun'
2
+ require_relative '../lib/parser'
3
+
4
# Exercises Parser against the fixture stored in test/files/job_posting.txt.
class TestParser < Minitest::Test
  # Fresh parser and a reference word-count table for every test.
  def setup
    @word_counts = { '10x' => 1, 'ninja' => 3, 'douchenozzle' => 2 }
    @parser = Parser.new('test/files/job_posting.txt')
  end

  def test_that_HTML_tags_are_stripped
    expected = 'ninja 10x wanted for ninja douchenozzle team'
    assert_equal(expected, @parser.strip_tags)
  end

  def test_that_words_are_extracted
    expected = %w[ninja 10x wanted for ninja douchenozzle team]
    assert_equal(expected, @parser.extract_words)
  end

  def test_that_words_are_counted
    words = %w[10x ninja douchenozzle ninja douchenozzle ninja]
    assert_equal(@word_counts, @parser.count_words(words))
  end

  def test_that_the_hash_is_ordered
    expected = [['10x', 1], ['douchenozzle', 2], ['ninja', 3]]
    assert_equal(expected, @parser.order_hash(@word_counts))
  end

  def test_that_nouns_are_extracted
    expected = %w[team]
    assert_equal(expected, @parser.extract_nouns)
  end
end
32
+
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: juxtaparssionate
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Thomas Osborn
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-04-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: treat
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rwordnet
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: mechanize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: sanitize
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: juxtaparssionate parses and compares job postings from various popular
84
+ job boards. By comparing word frequency, sentence length, and parts-of-speech counts,
85
+ juxtaparssionate produces metrics loosely determining the quality and uniqueness
86
+ of a job posting
87
+ email: trosborn@gmail.com
88
+ executables: []
89
+ extensions: []
90
+ extra_rdoc_files: []
91
+ files:
92
+ - README.rdoc
93
+ - lib/parser.rb
94
+ - lib/scraper.rb
95
+ - lib/version.rb
96
+ - test/files/job_posting.txt
97
+ - test/test_parser.rb
98
+ homepage: http://www.github.com/trosborn/juxtaparssionate
99
+ licenses:
100
+ - MIT
101
+ metadata: {}
102
+ post_install_message:
103
+ rdoc_options: []
104
+ require_paths:
105
+ - lib
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ required_rubygems_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ! '>='
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ requirements: []
117
+ rubyforge_project:
118
+ rubygems_version: 2.4.5
119
+ signing_key:
120
+ specification_version: 4
121
+ summary: A tool for parsing and comparing job postings
122
+ test_files: []